{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 100, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "epsilon_dpo/beta": 0.10013501346111298, "epsilon_dpo/beta_margin_grad_mean": -0.5002685785293579, "epsilon_dpo/beta_margin_grad_std": 0.0056771161034703255, "epsilon_dpo/beta_margin_mean": -0.0010739759309217334, "epsilon_dpo/beta_margin_std": 0.02271234802901745, "epsilon_dpo/loss_margin_mean": -0.00900276005268097, "grad_norm": 65.00322723388672, "kl/avg_steps": -0.125, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 0.0, "logits/chosen": -3.5397751331329346, "logits/rejected": -3.6458287239074707, "logps/chosen": -58.97797775268555, "logps/ref_chosen": -58.954925537109375, "logps/ref_rejected": -59.41980743408203, "logps/rejected": -59.43385696411133, "loss": 1.3875, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0024051459040492773, "rewards/margins": -0.00107401586137712, "rewards/rejected": -0.0013311300426721573, "step": 1 }, { "epoch": 0.0030234315948601664, "epsilon_dpo/beta": 0.10013516247272491, "epsilon_dpo/beta_margin_grad_mean": -0.5005137920379639, "epsilon_dpo/beta_margin_grad_std": 0.004163636360317469, "epsilon_dpo/beta_margin_mean": -0.0020554298534989357, "epsilon_dpo/beta_margin_std": 0.016656115651130676, "epsilon_dpo/loss_margin_mean": -0.01922258734703064, "grad_norm": 62.63206481933594, "kl/avg_steps": 0.0, "kl/beta": 0.10012515634298325, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 7.462686567164179e-09, "logits/chosen": -3.5709261894226074, "logits/rejected": -3.576897621154785, "logps/chosen": -48.390830993652344, "logps/ref_chosen": -48.38141632080078, "logps/ref_rejected": -60.59125900268555, "logps/rejected": -60.581451416015625, "loss": 1.3884, "rewards/accuracies": 0.5, "rewards/chosen": -0.0010149386944249272, "rewards/margins": -0.0020554400980472565, "rewards/rejected": 0.001040501520037651, "step": 2 }, { "epoch": 0.0045351473922902496, "epsilon_dpo/beta": 0.10004129260778427, "epsilon_dpo/beta_margin_grad_mean": -0.5001283288002014, "epsilon_dpo/beta_margin_grad_std": 0.006361375562846661, "epsilon_dpo/beta_margin_mean": -0.0005140957655385137, "epsilon_dpo/beta_margin_std": 0.025450462475419044, "epsilon_dpo/loss_margin_mean": -0.003169998526573181, "grad_norm": 83.4357681274414, "kl/avg_steps": 0.09375, "kl/beta": 0.10012515634298325, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -3.5514073371887207, "logits/rejected": -3.618317127227783, "logps/chosen": -56.170440673828125, "logps/ref_chosen": -56.157108306884766, "logps/ref_rejected": -88.33527374267578, "logps/rejected": -88.34542846679688, "loss": 1.387, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0014039338566362858, "rewards/margins": -0.000514086103066802, "rewards/rejected": -0.0008898477535694838, "step": 3 }, { "epoch": 0.006046863189720333, "epsilon_dpo/beta": 0.1001664325594902, "epsilon_dpo/beta_margin_grad_mean": -0.5002225041389465, "epsilon_dpo/beta_margin_grad_std": 0.006250501144677401, "epsilon_dpo/beta_margin_mean": -0.0008897508378140628, "epsilon_dpo/beta_margin_std": 0.025006743147969246, "epsilon_dpo/loss_margin_mean": -0.006957381963729858, "grad_norm": 76.01309204101562, "kl/avg_steps": -0.125, "kl/beta": 0.10003137588500977, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 2.2388059701492534e-08, "logits/chosen": -3.6201987266540527, "logits/rejected": -3.628134250640869, "logps/chosen": -60.51570129394531, "logps/ref_chosen": -60.512386322021484, "logps/ref_rejected": -71.28668212890625, "logps/rejected": -71.28303527832031, "loss": 1.3873, "rewards/accuracies": 0.421875, "rewards/chosen": -0.00041586445877328515, "rewards/margins": -0.000889755436219275, "rewards/rejected": 0.0004738911520689726, "step": 4 }, { "epoch": 0.007558578987150416, "epsilon_dpo/beta": 0.10007268190383911, "epsilon_dpo/beta_margin_grad_mean": -0.49999529123306274, "epsilon_dpo/beta_margin_grad_std": 0.004807771183550358, "epsilon_dpo/beta_margin_mean": 1.8317823560209945e-05, "epsilon_dpo/beta_margin_std": 0.019233649596571922, "epsilon_dpo/loss_margin_mean": 0.0016630440950393677, "grad_norm": 68.25946807861328, "kl/avg_steps": 0.09375, "kl/beta": 0.10015657544136047, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -3.525019407272339, "logits/rejected": -3.5357487201690674, "logps/chosen": -55.076087951660156, "logps/ref_chosen": -55.076438903808594, "logps/ref_rejected": -70.50552368164062, "logps/rejected": -70.5068359375, "loss": 1.3864, "rewards/accuracies": 0.53125, "rewards/chosen": -3.81012141588144e-05, "rewards/margins": 1.8276841728948057e-05, "rewards/rejected": -5.637804861180484e-05, "step": 5 }, { "epoch": 0.009070294784580499, "epsilon_dpo/beta": 0.09980680048465729, "epsilon_dpo/beta_margin_grad_mean": -0.4991690218448639, "epsilon_dpo/beta_margin_grad_std": 0.006899727508425713, "epsilon_dpo/beta_margin_mean": 0.0033227831590920687, "epsilon_dpo/beta_margin_std": 0.027610693126916885, "epsilon_dpo/loss_margin_mean": 0.0350785106420517, "grad_norm": 70.01203155517578, "kl/avg_steps": 0.265625, "kl/beta": 0.1000627651810646, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.731343283582089e-08, "logits/chosen": -3.5272698402404785, "logits/rejected": -3.5592615604400635, "logps/chosen": -42.394569396972656, "logps/ref_chosen": -42.39640808105469, "logps/ref_rejected": -75.95503997802734, "logps/rejected": -75.98828125, "loss": 1.3832, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00011435325723141432, "rewards/margins": 0.003322768025100231, "rewards/rejected": -0.0032084148842841387, "step": 6 }, { "epoch": 0.010582010582010581, "epsilon_dpo/beta": 0.09943337738513947, "epsilon_dpo/beta_margin_grad_mean": -0.4977419376373291, "epsilon_dpo/beta_margin_grad_std": 0.005063969176262617, "epsilon_dpo/beta_margin_mean": 0.009033161215484142, "epsilon_dpo/beta_margin_std": 0.02025826834142208, "epsilon_dpo/loss_margin_mean": 0.09225571155548096, "grad_norm": 62.41374969482422, "kl/avg_steps": 0.375, "kl/beta": 0.09979767352342606, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.477611940298507e-08, "logits/chosen": -3.5664730072021484, "logits/rejected": -3.583383560180664, "logps/chosen": -46.943275451660156, "logps/ref_chosen": -46.99086380004883, "logps/ref_rejected": -60.83911895751953, "logps/rejected": -60.88378143310547, "loss": 1.3774, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004670644178986549, "rewards/margins": 0.00903317704796791, "rewards/rejected": -0.004362532868981361, "step": 7 }, { "epoch": 0.012093726379440665, "epsilon_dpo/beta": 0.09918618947267532, "epsilon_dpo/beta_margin_grad_mean": -0.4986364245414734, "epsilon_dpo/beta_margin_grad_std": 0.005442216992378235, "epsilon_dpo/beta_margin_mean": 0.005456067156046629, "epsilon_dpo/beta_margin_std": 0.021775512024760246, "epsilon_dpo/loss_margin_mean": 0.05641074478626251, "grad_norm": 68.69767761230469, "kl/avg_steps": 0.25, "kl/beta": 0.0994248315691948, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 5.223880597014925e-08, "logits/chosen": -3.552912712097168, "logits/rejected": -3.533595561981201, "logps/chosen": -51.74263000488281, "logps/ref_chosen": -51.75770568847656, "logps/ref_rejected": -69.95840454101562, "logps/rejected": -69.99974060058594, "loss": 1.381, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0014538767281919718, "rewards/margins": 0.005456099286675453, "rewards/rejected": -0.004002222791314125, "step": 8 }, { "epoch": 0.013605442176870748, "epsilon_dpo/beta": 0.09906282275915146, "epsilon_dpo/beta_margin_grad_mean": -0.4988862872123718, "epsilon_dpo/beta_margin_grad_std": 0.006079367361962795, "epsilon_dpo/beta_margin_mean": 0.004455941263586283, "epsilon_dpo/beta_margin_std": 0.02432290092110634, "epsilon_dpo/loss_margin_mean": 0.04687276482582092, "grad_norm": 68.39608001708984, "kl/avg_steps": 0.125, "kl/beta": 0.09917689114809036, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 5.970149253731343e-08, "logits/chosen": -3.530184268951416, "logits/rejected": -3.632469654083252, "logps/chosen": -47.741905212402344, "logps/ref_chosen": -47.743324279785156, "logps/ref_rejected": -81.2067642211914, "logps/rejected": -81.2522201538086, "loss": 1.382, "rewards/accuracies": 0.5625, "rewards/chosen": 7.347855716943741e-05, "rewards/margins": 0.004455917980521917, "rewards/rejected": -0.004382439889013767, "step": 9 }, { "epoch": 0.015117157974300832, "epsilon_dpo/beta": 0.09903202950954437, "epsilon_dpo/beta_margin_grad_mean": -0.4994482398033142, "epsilon_dpo/beta_margin_grad_std": 0.00682299118489027, "epsilon_dpo/beta_margin_mean": 0.0022085336968302727, "epsilon_dpo/beta_margin_std": 0.027299627661705017, "epsilon_dpo/loss_margin_mean": 0.02426770329475403, "grad_norm": 72.99188232421875, "kl/avg_steps": 0.03125, "kl/beta": 0.09905307739973068, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 6.71641791044776e-08, "logits/chosen": -3.600921154022217, "logits/rejected": -3.6179487705230713, "logps/chosen": -50.462890625, "logps/ref_chosen": -50.466697692871094, "logps/ref_rejected": -80.81033325195312, "logps/rejected": -80.83079528808594, "loss": 1.3843, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0003000605502165854, "rewards/margins": 0.002208557678386569, "rewards/rejected": -0.0019084971863776445, "step": 10 }, { "epoch": 0.016628873771730914, "epsilon_dpo/beta": 0.09890823811292648, "epsilon_dpo/beta_margin_grad_mean": -0.4994899034500122, "epsilon_dpo/beta_margin_grad_std": 0.0064935204572975636, "epsilon_dpo/beta_margin_mean": 0.002040550811216235, "epsilon_dpo/beta_margin_std": 0.025979233905673027, "epsilon_dpo/loss_margin_mean": 0.022559180855751038, "grad_norm": 66.8790054321289, "kl/avg_steps": 0.125, "kl/beta": 0.09902212768793106, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 7.462686567164178e-08, "logits/chosen": -3.529179573059082, "logits/rejected": -3.588998556137085, "logps/chosen": -58.84630584716797, "logps/ref_chosen": -58.85676193237305, "logps/ref_rejected": -68.5490951538086, "logps/rejected": -68.56119537353516, "loss": 1.3844, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0009481033775955439, "rewards/margins": 0.0020405417308211327, "rewards/rejected": -0.0010924384696409106, "step": 11 }, { "epoch": 0.018140589569160998, "epsilon_dpo/beta": 0.09890839457511902, "epsilon_dpo/beta_margin_grad_mean": -0.49997660517692566, "epsilon_dpo/beta_margin_grad_std": 0.0044360035099089146, "epsilon_dpo/beta_margin_mean": 9.364628203911707e-05, "epsilon_dpo/beta_margin_std": 0.017745599150657654, "epsilon_dpo/loss_margin_mean": 0.0022882670164108276, "grad_norm": 65.7290267944336, "kl/avg_steps": 0.0, "kl/beta": 0.0988985076546669, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 8.208955223880596e-08, "logits/chosen": -3.553344488143921, "logits/rejected": -3.5528182983398438, "logps/chosen": -54.05010223388672, "logps/ref_chosen": -54.048728942871094, "logps/ref_rejected": -66.68499755859375, "logps/rejected": -66.68865966796875, "loss": 1.3863, "rewards/accuracies": 0.5, "rewards/chosen": -0.00020027137361466885, "rewards/margins": 9.365554433315992e-05, "rewards/rejected": -0.00029392680153250694, "step": 12 }, { "epoch": 0.019652305366591082, "epsilon_dpo/beta": 0.09884657710790634, "epsilon_dpo/beta_margin_grad_mean": -0.5001493692398071, "epsilon_dpo/beta_margin_grad_std": 0.005271880887448788, "epsilon_dpo/beta_margin_mean": -0.0005976192187517881, "epsilon_dpo/beta_margin_std": 0.021089736372232437, "epsilon_dpo/loss_margin_mean": -0.004355251789093018, "grad_norm": 64.20111083984375, "kl/avg_steps": 0.0625, "kl/beta": 0.0988985076546669, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 8.955223880597014e-08, "logits/chosen": -3.573941230773926, "logits/rejected": -3.618351936340332, "logps/chosen": -43.061561584472656, "logps/ref_chosen": -43.04643249511719, "logps/ref_rejected": -83.55763244628906, "logps/rejected": -83.56840515136719, "loss": 1.387, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0015550702810287476, "rewards/margins": -0.0005976142128929496, "rewards/rejected": -0.0009574559517204762, "step": 13 }, { "epoch": 0.021164021164021163, "epsilon_dpo/beta": 0.09866128116846085, "epsilon_dpo/beta_margin_grad_mean": -0.49950075149536133, "epsilon_dpo/beta_margin_grad_std": 0.006601857487112284, "epsilon_dpo/beta_margin_mean": 0.001997843151912093, "epsilon_dpo/beta_margin_std": 0.02641463652253151, "epsilon_dpo/loss_margin_mean": 0.02212756872177124, "grad_norm": 67.3970947265625, "kl/avg_steps": 0.1875, "kl/beta": 0.0988367348909378, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 9.701492537313432e-08, "logits/chosen": -3.514829158782959, "logits/rejected": -3.5971553325653076, "logps/chosen": -48.203800201416016, "logps/ref_chosen": -48.19769287109375, "logps/ref_rejected": -67.02523803710938, "logps/rejected": -67.05347442626953, "loss": 1.3845, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0007104685064405203, "rewards/margins": 0.001997825223952532, "rewards/rejected": -0.002708293730393052, "step": 14 }, { "epoch": 0.022675736961451247, "epsilon_dpo/beta": 0.09863079339265823, "epsilon_dpo/beta_margin_grad_mean": -0.4990391433238983, "epsilon_dpo/beta_margin_grad_std": 0.005389949306845665, "epsilon_dpo/beta_margin_mean": 0.003844263032078743, "epsilon_dpo/beta_margin_std": 0.02156258188188076, "epsilon_dpo/loss_margin_mean": 0.04069235920906067, "grad_norm": 69.84053039550781, "kl/avg_steps": 0.03125, "kl/beta": 0.09865175932645798, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.044776119402985e-07, "logits/chosen": -3.6175551414489746, "logits/rejected": -3.6196587085723877, "logps/chosen": -58.166717529296875, "logps/ref_chosen": -58.18760299682617, "logps/ref_rejected": -79.85305786132812, "logps/rejected": -79.87286376953125, "loss": 1.3826, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0019768245983868837, "rewards/margins": 0.0038442397490143776, "rewards/rejected": -0.0018674151506274939, "step": 15 }, { "epoch": 0.02418745275888133, "epsilon_dpo/beta": 0.09863080084323883, "epsilon_dpo/beta_margin_grad_mean": -0.49983665347099304, "epsilon_dpo/beta_margin_grad_std": 0.005003057420253754, "epsilon_dpo/beta_margin_mean": 0.0006535464199259877, "epsilon_dpo/beta_margin_std": 0.020014582201838493, "epsilon_dpo/loss_margin_mean": 0.008197292685508728, "grad_norm": 59.940608978271484, "kl/avg_steps": 0.0, "kl/beta": 0.0986209437251091, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.1194029850746268e-07, "logits/chosen": -3.509491443634033, "logits/rejected": -3.593863010406494, "logps/chosen": -40.03649139404297, "logps/ref_chosen": -40.05681610107422, "logps/ref_rejected": -62.782501220703125, "logps/rejected": -62.77037048339844, "loss": 1.3857, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0019453626591712236, "rewards/margins": 0.0006535369320772588, "rewards/rejected": 0.001291825668886304, "step": 16 }, { "epoch": 0.025699168556311415, "epsilon_dpo/beta": 0.09869244694709778, "epsilon_dpo/beta_margin_grad_mean": -0.5002594590187073, "epsilon_dpo/beta_margin_grad_std": 0.005039663054049015, "epsilon_dpo/beta_margin_mean": -0.001037744339555502, "epsilon_dpo/beta_margin_std": 0.02016161009669304, "epsilon_dpo/loss_margin_mean": -0.008974939584732056, "grad_norm": 71.07174682617188, "kl/avg_steps": -0.0625, "kl/beta": 0.0986209437251091, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 1.1940298507462686e-07, "logits/chosen": -3.5748443603515625, "logits/rejected": -3.5934619903564453, "logps/chosen": -62.76985168457031, "logps/ref_chosen": -62.75297927856445, "logps/ref_rejected": -83.04196166992188, "logps/rejected": -83.04986572265625, "loss": 1.3874, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0017319807084277272, "rewards/margins": -0.0010376930003985763, "rewards/rejected": -0.00069428764982149, "step": 17 }, { "epoch": 0.027210884353741496, "epsilon_dpo/beta": 0.09853827953338623, "epsilon_dpo/beta_margin_grad_mean": -0.4988418221473694, "epsilon_dpo/beta_margin_grad_std": 0.0045989202335476875, "epsilon_dpo/beta_margin_mean": 0.00463343458250165, "epsilon_dpo/beta_margin_std": 0.018397780135273933, "epsilon_dpo/loss_margin_mean": 0.04849405586719513, "grad_norm": 61.46513366699219, "kl/avg_steps": 0.15625, "kl/beta": 0.09868261963129044, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.2686567164179106e-07, "logits/chosen": -3.524407386779785, "logits/rejected": -3.557884454727173, "logps/chosen": -42.14786911010742, "logps/ref_chosen": -42.158111572265625, "logps/ref_rejected": -67.2503662109375, "logps/rejected": -67.28861999511719, "loss": 1.3818, "rewards/accuracies": 0.5625, "rewards/chosen": 0.000927187327761203, "rewards/margins": 0.004633418750017881, "rewards/rejected": -0.0037062314804643393, "step": 18 }, { "epoch": 0.02872260015117158, "epsilon_dpo/beta": 0.0983845517039299, "epsilon_dpo/beta_margin_grad_mean": -0.49812963604927063, "epsilon_dpo/beta_margin_grad_std": 0.007250434719026089, "epsilon_dpo/beta_margin_mean": 0.007482836954295635, "epsilon_dpo/beta_margin_std": 0.029014047235250473, "epsilon_dpo/loss_margin_mean": 0.07805107533931732, "grad_norm": 70.57123565673828, "kl/avg_steps": 0.15625, "kl/beta": 0.0985286682844162, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.343283582089552e-07, "logits/chosen": -3.6558282375335693, "logits/rejected": -3.547760486602783, "logps/chosen": -56.68019104003906, "logps/ref_chosen": -56.74137878417969, "logps/ref_rejected": -74.71025085449219, "logps/rejected": -74.72711181640625, "loss": 1.379, "rewards/accuracies": 0.578125, "rewards/chosen": 0.005933217704296112, "rewards/margins": 0.007482876535505056, "rewards/rejected": -0.001549659064039588, "step": 19 }, { "epoch": 0.030234315948601664, "epsilon_dpo/beta": 0.09820032119750977, "epsilon_dpo/beta_margin_grad_mean": -0.4986565113067627, "epsilon_dpo/beta_margin_grad_std": 0.004346934147179127, "epsilon_dpo/beta_margin_mean": 0.005374664906412363, "epsilon_dpo/beta_margin_std": 0.017390085384249687, "epsilon_dpo/loss_margin_mean": 0.05604107677936554, "grad_norm": 65.79425811767578, "kl/avg_steps": 0.1875, "kl/beta": 0.09837495535612106, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 1.4179104477611938e-07, "logits/chosen": -3.569793224334717, "logits/rejected": -3.5764927864074707, "logps/chosen": -48.64110565185547, "logps/ref_chosen": -48.623435974121094, "logps/ref_rejected": -68.80427551269531, "logps/rejected": -68.87798309326172, "loss": 1.381, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0017850392032414675, "rewards/margins": 0.005374701227992773, "rewards/rejected": -0.007159740664064884, "step": 20 }, { "epoch": 0.031746031746031744, "epsilon_dpo/beta": 0.09804722666740417, "epsilon_dpo/beta_margin_grad_mean": -0.49857431650161743, "epsilon_dpo/beta_margin_grad_std": 0.005364253185689449, "epsilon_dpo/beta_margin_mean": 0.005702945403754711, "epsilon_dpo/beta_margin_std": 0.02146092802286148, "epsilon_dpo/loss_margin_mean": 0.059761419892311096, "grad_norm": 68.31267547607422, "kl/avg_steps": 0.15625, "kl/beta": 0.09819085150957108, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.4925373134328355e-07, "logits/chosen": -3.5573177337646484, "logits/rejected": -3.580090045928955, "logps/chosen": -55.063194274902344, "logps/ref_chosen": -55.05763244628906, "logps/ref_rejected": -76.93499755859375, "logps/rejected": -77.00032043457031, "loss": 1.3807, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0006060463492758572, "rewards/margins": 0.0057029761373996735, "rewards/rejected": -0.006309022195637226, "step": 21 }, { "epoch": 0.03325774754346183, "epsilon_dpo/beta": 0.09772560000419617, "epsilon_dpo/beta_margin_grad_mean": -0.4983994662761688, "epsilon_dpo/beta_margin_grad_std": 0.006077747792005539, "epsilon_dpo/beta_margin_mean": 0.006402758881449699, "epsilon_dpo/beta_margin_std": 0.024315940216183662, "epsilon_dpo/loss_margin_mean": 0.06717559695243835, "grad_norm": 65.34651947021484, "kl/avg_steps": 0.328125, "kl/beta": 0.09803766757249832, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.5671641791044775e-07, "logits/chosen": -3.5851380825042725, "logits/rejected": -3.637094736099243, "logps/chosen": -50.14364242553711, "logps/ref_chosen": -50.15445327758789, "logps/ref_rejected": -74.65166473388672, "logps/rejected": -74.7080307006836, "loss": 1.38, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0009799579856917262, "rewards/margins": 0.006402815692126751, "rewards/rejected": -0.005422857590019703, "step": 22 }, { "epoch": 0.03476946334089191, "epsilon_dpo/beta": 0.09748248755931854, "epsilon_dpo/beta_margin_grad_mean": -0.4977494478225708, "epsilon_dpo/beta_margin_grad_std": 0.007980192080140114, "epsilon_dpo/beta_margin_mean": 0.009005580097436905, "epsilon_dpo/beta_margin_std": 0.0319342240691185, "epsilon_dpo/loss_margin_mean": 0.09471076726913452, "grad_norm": 77.26547241210938, "kl/avg_steps": 0.25, "kl/beta": 0.0977170318365097, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.6417910447761193e-07, "logits/chosen": -3.5601205825805664, "logits/rejected": -3.607548952102661, "logps/chosen": -58.49126052856445, "logps/ref_chosen": -58.51670837402344, "logps/ref_rejected": -85.24751281738281, "logps/rejected": -85.3167724609375, "loss": 1.3776, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0023975432850420475, "rewards/margins": 0.009005581960082054, "rewards/rejected": -0.006608038209378719, "step": 23 }, { "epoch": 0.036281179138321996, "epsilon_dpo/beta": 0.09720892459154129, "epsilon_dpo/beta_margin_grad_mean": -0.4976484179496765, "epsilon_dpo/beta_margin_grad_std": 0.0069191427901387215, "epsilon_dpo/beta_margin_mean": 0.00940895825624466, "epsilon_dpo/beta_margin_std": 0.027687160298228264, "epsilon_dpo/loss_margin_mean": 0.09873881936073303, "grad_norm": 61.283119201660156, "kl/avg_steps": 0.28125, "kl/beta": 0.09747334569692612, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.716417910447761e-07, "logits/chosen": -3.55317759513855, "logits/rejected": -3.5941121578216553, "logps/chosen": -45.63995361328125, "logps/ref_chosen": -45.64161682128906, "logps/ref_rejected": -63.19910430908203, "logps/rejected": -63.296180725097656, "loss": 1.3771, "rewards/accuracies": 0.640625, "rewards/chosen": 7.94949010014534e-05, "rewards/margins": 0.009409010410308838, "rewards/rejected": -0.00932951457798481, "step": 24 }, { "epoch": 0.03779289493575208, "epsilon_dpo/beta": 0.0967540293931961, "epsilon_dpo/beta_margin_grad_mean": -0.49625471234321594, "epsilon_dpo/beta_margin_grad_std": 0.007684577722102404, "epsilon_dpo/beta_margin_mean": 0.014987252652645111, "epsilon_dpo/beta_margin_std": 0.030755776911973953, "epsilon_dpo/loss_margin_mean": 0.1567019522190094, "grad_norm": 67.19364166259766, "kl/avg_steps": 0.46875, "kl/beta": 0.09719997644424438, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.7910447761194027e-07, "logits/chosen": -3.6059727668762207, "logits/rejected": -3.6505181789398193, "logps/chosen": -52.91407775878906, "logps/ref_chosen": -52.92271041870117, "logps/ref_rejected": -84.4869613647461, "logps/rejected": -84.63502502441406, "loss": 1.3716, "rewards/accuracies": 0.75, "rewards/chosen": 0.0007681053248234093, "rewards/margins": 0.014987286180257797, "rewards/rejected": -0.01421917974948883, "step": 25 }, { "epoch": 0.039304610733182165, "epsilon_dpo/beta": 0.09645378589630127, "epsilon_dpo/beta_margin_grad_mean": -0.49735695123672485, "epsilon_dpo/beta_margin_grad_std": 0.007086944300681353, "epsilon_dpo/beta_margin_mean": 0.010575017891824245, "epsilon_dpo/beta_margin_std": 0.028354529291391373, "epsilon_dpo/loss_margin_mean": 0.11169630289077759, "grad_norm": 65.47696685791016, "kl/avg_steps": 0.3125, "kl/beta": 0.09674647450447083, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.8656716417910447e-07, "logits/chosen": -3.5339903831481934, "logits/rejected": -3.5546607971191406, "logps/chosen": -47.662109375, "logps/ref_chosen": -47.644371032714844, "logps/ref_rejected": -73.40299987792969, "logps/rejected": -73.53244018554688, "loss": 1.3759, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0018067193450406194, "rewards/margins": 0.01057504117488861, "rewards/rejected": -0.012381760403513908, "step": 26 }, { "epoch": 0.04081632653061224, "epsilon_dpo/beta": 0.09612315893173218, "epsilon_dpo/beta_margin_grad_mean": -0.49750056862831116, "epsilon_dpo/beta_margin_grad_std": 0.007807661779224873, "epsilon_dpo/beta_margin_mean": 0.010000926442444324, "epsilon_dpo/beta_margin_std": 0.0312412828207016, "epsilon_dpo/loss_margin_mean": 0.10625442862510681, "grad_norm": 65.35517120361328, "kl/avg_steps": 0.34375, "kl/beta": 0.09644508361816406, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.9402985074626865e-07, "logits/chosen": -3.560518264770508, "logits/rejected": -3.566493034362793, "logps/chosen": -56.50353240966797, "logps/ref_chosen": -56.45243453979492, "logps/ref_rejected": -62.60496520996094, "logps/rejected": -62.76231384277344, "loss": 1.3766, "rewards/accuracies": 0.671875, "rewards/chosen": -0.004973269067704678, "rewards/margins": 0.010000906884670258, "rewards/rejected": -0.014974175952374935, "step": 27 }, { "epoch": 0.042328042328042326, "epsilon_dpo/beta": 0.0958087369799614, "epsilon_dpo/beta_margin_grad_mean": -0.4961710572242737, "epsilon_dpo/beta_margin_grad_std": 0.008399232290685177, "epsilon_dpo/beta_margin_mean": 0.01531986054033041, "epsilon_dpo/beta_margin_std": 0.03361152857542038, "epsilon_dpo/loss_margin_mean": 0.16230668127536774, "grad_norm": 63.66240310668945, "kl/avg_steps": 0.328125, "kl/beta": 0.09611468762159348, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.0149253731343282e-07, "logits/chosen": -3.57749605178833, "logits/rejected": -3.6017184257507324, "logps/chosen": -51.37296676635742, "logps/ref_chosen": -51.36060333251953, "logps/ref_rejected": -62.52366638183594, "logps/rejected": -62.698341369628906, "loss": 1.3713, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0012942205648869276, "rewards/margins": 0.015319883823394775, "rewards/rejected": -0.01661410555243492, "step": 28 }, { "epoch": 0.04383975812547241, "epsilon_dpo/beta": 0.09545063972473145, "epsilon_dpo/beta_margin_grad_mean": -0.49559664726257324, "epsilon_dpo/beta_margin_grad_std": 0.009968101046979427, "epsilon_dpo/beta_margin_mean": 0.01762251928448677, "epsilon_dpo/beta_margin_std": 0.039902232587337494, "epsilon_dpo/loss_margin_mean": 0.18724413216114044, "grad_norm": 69.36910247802734, "kl/avg_steps": 0.375, "kl/beta": 0.09580034762620926, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.08955223880597e-07, "logits/chosen": -3.6223435401916504, "logits/rejected": -3.6054773330688477, "logps/chosen": -60.97156524658203, "logps/ref_chosen": -60.97344970703125, "logps/ref_rejected": -75.95550537109375, "logps/rejected": -76.140869140625, "loss": 1.3691, "rewards/accuracies": 0.6875, "rewards/chosen": 7.41246622055769e-05, "rewards/margins": 0.017622580751776695, "rewards/rejected": -0.01754845678806305, "step": 29 }, { "epoch": 0.045351473922902494, "epsilon_dpo/beta": 0.09515369683504105, "epsilon_dpo/beta_margin_grad_mean": -0.49709352850914, "epsilon_dpo/beta_margin_grad_std": 0.011745194904506207, "epsilon_dpo/beta_margin_mean": 0.011628457345068455, "epsilon_dpo/beta_margin_std": 0.04702313616871834, "epsilon_dpo/loss_margin_mean": 0.12552256882190704, "grad_norm": 64.3920669555664, "kl/avg_steps": 0.3125, "kl/beta": 0.09544243663549423, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.1641791044776117e-07, "logits/chosen": -3.5751404762268066, "logits/rejected": -3.629140615463257, "logps/chosen": -43.58692169189453, "logps/ref_chosen": -43.47471618652344, "logps/ref_rejected": -78.57861328125, "logps/rejected": -78.81632995605469, "loss": 1.3753, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010817013680934906, "rewards/margins": 0.011628467589616776, "rewards/rejected": -0.02244548127055168, "step": 30 }, { "epoch": 0.04686318972033258, "epsilon_dpo/beta": 0.09488699585199356, "epsilon_dpo/beta_margin_grad_mean": -0.49726614356040955, "epsilon_dpo/beta_margin_grad_std": 0.01422181911766529, "epsilon_dpo/beta_margin_mean": 0.010937676765024662, "epsilon_dpo/beta_margin_std": 0.05696655437350273, "epsilon_dpo/loss_margin_mean": 0.11928337812423706, "grad_norm": 73.71058654785156, "kl/avg_steps": 0.28125, "kl/beta": 0.09514510631561279, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.2388059701492537e-07, "logits/chosen": -3.6132450103759766, "logits/rejected": -3.5954675674438477, "logps/chosen": -58.194419860839844, "logps/ref_chosen": -58.1278076171875, "logps/ref_rejected": -86.19705200195312, "logps/rejected": -86.38294982910156, "loss": 1.3762, "rewards/accuracies": 0.609375, "rewards/chosen": -0.006451633758842945, "rewards/margins": 0.010937697254121304, "rewards/rejected": -0.01738933101296425, "step": 31 }, { "epoch": 0.04837490551776266, "epsilon_dpo/beta": 0.09453192353248596, "epsilon_dpo/beta_margin_grad_mean": -0.49421560764312744, "epsilon_dpo/beta_margin_grad_std": 0.01188565045595169, "epsilon_dpo/beta_margin_mean": 0.02315065823495388, "epsilon_dpo/beta_margin_std": 0.04757777974009514, "epsilon_dpo/loss_margin_mean": 0.2483389675617218, "grad_norm": 66.10027313232422, "kl/avg_steps": 0.375, "kl/beta": 0.09487826377153397, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.3134328358208954e-07, "logits/chosen": -3.615649461746216, "logits/rejected": -3.645280599594116, "logps/chosen": -63.506473541259766, "logps/ref_chosen": -63.442073822021484, "logps/ref_rejected": -78.22062683105469, "logps/rejected": -78.53337097167969, "loss": 1.3638, "rewards/accuracies": 0.734375, "rewards/chosen": -0.006223967764526606, "rewards/margins": 0.023150702938437462, "rewards/rejected": -0.02937467023730278, "step": 32 }, { "epoch": 0.049886621315192746, "epsilon_dpo/beta": 0.09403103590011597, "epsilon_dpo/beta_margin_grad_mean": -0.49343442916870117, "epsilon_dpo/beta_margin_grad_std": 0.01561080850660801, "epsilon_dpo/beta_margin_mean": 0.026265764608979225, "epsilon_dpo/beta_margin_std": 0.06257874518632889, "epsilon_dpo/loss_margin_mean": 0.28316259384155273, "grad_norm": 69.26751708984375, "kl/avg_steps": 0.53125, "kl/beta": 0.09452379494905472, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.388059701492537e-07, "logits/chosen": -3.5993685722351074, "logits/rejected": -3.6356759071350098, "logps/chosen": -55.675262451171875, "logps/ref_chosen": -55.516849517822266, "logps/ref_rejected": -64.91047668457031, "logps/rejected": -65.35205078125, "loss": 1.3612, "rewards/accuracies": 0.75, "rewards/chosen": -0.015066804364323616, "rewards/margins": 0.026265766471624374, "rewards/rejected": -0.04133257269859314, "step": 33 }, { "epoch": 0.05139833711262283, "epsilon_dpo/beta": 0.09371045231819153, "epsilon_dpo/beta_margin_grad_mean": -0.4952542185783386, "epsilon_dpo/beta_margin_grad_std": 0.011118343099951744, "epsilon_dpo/beta_margin_mean": 0.018992209807038307, "epsilon_dpo/beta_margin_std": 0.04450416937470436, "epsilon_dpo/loss_margin_mean": 0.2059399038553238, "grad_norm": 58.72181701660156, "kl/avg_steps": 0.34375, "kl/beta": 0.09402429312467575, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.4626865671641786e-07, "logits/chosen": -3.503596305847168, "logits/rejected": -3.5986719131469727, "logps/chosen": -49.4073486328125, "logps/ref_chosen": -49.25294876098633, "logps/ref_rejected": -62.969539642333984, "logps/rejected": -63.32987976074219, "loss": 1.3679, "rewards/accuracies": 0.671875, "rewards/chosen": -0.014670913107693195, "rewards/margins": 0.018992213532328606, "rewards/rejected": -0.033663127571344376, "step": 34 }, { "epoch": 0.05291005291005291, "epsilon_dpo/beta": 0.09327228367328644, "epsilon_dpo/beta_margin_grad_mean": -0.49382129311561584, "epsilon_dpo/beta_margin_grad_std": 0.013546686619520187, "epsilon_dpo/beta_margin_mean": 0.024731820449233055, "epsilon_dpo/beta_margin_std": 0.054244864732027054, "epsilon_dpo/loss_margin_mean": 0.2686765193939209, "grad_norm": 60.66328811645508, "kl/avg_steps": 0.46875, "kl/beta": 0.09370218962430954, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.537313432835821e-07, "logits/chosen": -3.5901918411254883, "logits/rejected": -3.5892491340637207, "logps/chosen": -54.43663787841797, "logps/ref_chosen": -54.3001708984375, "logps/ref_rejected": -68.18136596679688, "logps/rejected": -68.58650970458984, "loss": 1.3625, "rewards/accuracies": 0.703125, "rewards/chosen": -0.012873869389295578, "rewards/margins": 0.024731824174523354, "rewards/rejected": -0.03760569542646408, "step": 35 }, { "epoch": 0.05442176870748299, "epsilon_dpo/beta": 0.09307029843330383, "epsilon_dpo/beta_margin_grad_mean": -0.4948699474334717, "epsilon_dpo/beta_margin_grad_std": 0.020978519693017006, "epsilon_dpo/beta_margin_mean": 0.020514175295829773, "epsilon_dpo/beta_margin_std": 0.08423285186290741, "epsilon_dpo/loss_margin_mean": 0.22663387656211853, "grad_norm": 65.45482635498047, "kl/avg_steps": 0.21875, "kl/beta": 0.09326501190662384, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 2.611940298507462e-07, "logits/chosen": -3.5999999046325684, "logits/rejected": -3.6096529960632324, "logps/chosen": -46.13711166381836, "logps/ref_chosen": -45.73765563964844, "logps/ref_rejected": -72.94627380371094, "logps/rejected": -73.57235717773438, "loss": 1.3677, "rewards/accuracies": 0.59375, "rewards/chosen": -0.037446994334459305, "rewards/margins": 0.020514149218797684, "rewards/rejected": -0.05796114355325699, "step": 36 }, { "epoch": 0.055933484504913075, "epsilon_dpo/beta": 0.0925181433558464, "epsilon_dpo/beta_margin_grad_mean": -0.4909619390964508, "epsilon_dpo/beta_margin_grad_std": 0.024680888280272484, "epsilon_dpo/beta_margin_mean": 0.03606438264250755, "epsilon_dpo/beta_margin_std": 0.09948907047510147, "epsilon_dpo/loss_margin_mean": 0.3956329822540283, "grad_norm": 76.82211303710938, "kl/avg_steps": 0.59375, "kl/beta": 0.09306143969297409, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.686567164179104e-07, "logits/chosen": -3.5992069244384766, "logits/rejected": -3.7180380821228027, "logps/chosen": -50.332767486572266, "logps/ref_chosen": -50.14415740966797, "logps/ref_rejected": -87.43962860107422, "logps/rejected": -88.02386474609375, "loss": 1.353, "rewards/accuracies": 0.796875, "rewards/chosen": -0.01751083880662918, "rewards/margins": 0.03606438264250755, "rewards/rejected": -0.05357522517442703, "step": 37 }, { "epoch": 0.05744520030234316, "epsilon_dpo/beta": 0.09217444807291031, "epsilon_dpo/beta_margin_grad_mean": -0.49103057384490967, "epsilon_dpo/beta_margin_grad_std": 0.0181855671107769, "epsilon_dpo/beta_margin_mean": 0.03594838082790375, "epsilon_dpo/beta_margin_std": 0.07289386540651321, "epsilon_dpo/loss_margin_mean": 0.3950408101081848, "grad_norm": 62.090572357177734, "kl/avg_steps": 0.375, "kl/beta": 0.09251215308904648, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.761194029850746e-07, "logits/chosen": -3.5741424560546875, "logits/rejected": -3.6092722415924072, "logps/chosen": -55.30254364013672, "logps/ref_chosen": -54.997772216796875, "logps/ref_rejected": -68.17823791503906, "logps/rejected": -68.87804412841797, "loss": 1.352, "rewards/accuracies": 0.671875, "rewards/chosen": -0.028347834944725037, "rewards/margins": 0.03594838082790375, "rewards/rejected": -0.06429621577262878, "step": 38 }, { "epoch": 0.05895691609977324, "epsilon_dpo/beta": 0.09183008968830109, "epsilon_dpo/beta_margin_grad_mean": -0.48849961161613464, "epsilon_dpo/beta_margin_grad_std": 0.022049404680728912, "epsilon_dpo/beta_margin_mean": 0.046143367886543274, "epsilon_dpo/beta_margin_std": 0.08852408826351166, "epsilon_dpo/loss_margin_mean": 0.508389949798584, "grad_norm": 63.296207427978516, "kl/avg_steps": 0.375, "kl/beta": 0.09216652810573578, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.8358208955223876e-07, "logits/chosen": -3.584090232849121, "logits/rejected": -3.52459716796875, "logps/chosen": -51.42475128173828, "logps/ref_chosen": -51.02398681640625, "logps/ref_rejected": -66.25391387939453, "logps/rejected": -67.16307067871094, "loss": 1.3426, "rewards/accuracies": 0.703125, "rewards/chosen": -0.03696953505277634, "rewards/margins": 0.046143367886543274, "rewards/rejected": -0.08311291038990021, "step": 39 }, { "epoch": 0.06046863189720333, "epsilon_dpo/beta": 0.09131482243537903, "epsilon_dpo/beta_margin_grad_mean": -0.4842977523803711, "epsilon_dpo/beta_margin_grad_std": 0.02141079306602478, "epsilon_dpo/beta_margin_mean": 0.06298519670963287, "epsilon_dpo/beta_margin_std": 0.08601222187280655, "epsilon_dpo/loss_margin_mean": 0.6946882605552673, "grad_norm": 80.45830535888672, "kl/avg_steps": 0.5625, "kl/beta": 0.09182219207286835, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.9104477611940296e-07, "logits/chosen": -3.530338764190674, "logits/rejected": -3.54994797706604, "logps/chosen": -52.79600524902344, "logps/ref_chosen": -52.68380355834961, "logps/ref_rejected": -83.0903091430664, "logps/rejected": -83.89720153808594, "loss": 1.3261, "rewards/accuracies": 0.78125, "rewards/chosen": -0.01044386811554432, "rewards/margins": 0.06298521161079407, "rewards/rejected": -0.07342907786369324, "step": 40 }, { "epoch": 0.06198034769463341, "epsilon_dpo/beta": 0.0907755121588707, "epsilon_dpo/beta_margin_grad_mean": -0.4792315363883972, "epsilon_dpo/beta_margin_grad_std": 0.02854551002383232, "epsilon_dpo/beta_margin_mean": 0.08342263847589493, "epsilon_dpo/beta_margin_std": 0.11486566811800003, "epsilon_dpo/loss_margin_mean": 0.9257802963256836, "grad_norm": 65.66410827636719, "kl/avg_steps": 0.59375, "kl/beta": 0.0913085788488388, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.985074626865671e-07, "logits/chosen": -3.6880173683166504, "logits/rejected": -3.640995502471924, "logps/chosen": -62.40971755981445, "logps/ref_chosen": -62.01616287231445, "logps/ref_rejected": -74.66358947753906, "logps/rejected": -75.98292541503906, "loss": 1.3079, "rewards/accuracies": 0.796875, "rewards/chosen": -0.03614383563399315, "rewards/margins": 0.08342261612415314, "rewards/rejected": -0.11956645548343658, "step": 41 }, { "epoch": 0.06349206349206349, "epsilon_dpo/beta": 0.09029644727706909, "epsilon_dpo/beta_margin_grad_mean": -0.4832316040992737, "epsilon_dpo/beta_margin_grad_std": 0.02639036998152733, "epsilon_dpo/beta_margin_mean": 0.06725002825260162, "epsilon_dpo/beta_margin_std": 0.10609689354896545, "epsilon_dpo/loss_margin_mean": 0.7514758110046387, "grad_norm": 61.02724838256836, "kl/avg_steps": 0.53125, "kl/beta": 0.09076963365077972, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.059701492537313e-07, "logits/chosen": -3.611198663711548, "logits/rejected": -3.6217169761657715, "logps/chosen": -57.40989303588867, "logps/ref_chosen": -56.97229766845703, "logps/ref_rejected": -72.9758529663086, "logps/rejected": -74.1649169921875, "loss": 1.323, "rewards/accuracies": 0.78125, "rewards/chosen": -0.03989080712199211, "rewards/margins": 0.06725005060434341, "rewards/rejected": -0.10714085400104523, "step": 42 }, { "epoch": 0.06500377928949358, "epsilon_dpo/beta": 0.0897628515958786, "epsilon_dpo/beta_margin_grad_mean": -0.4778538942337036, "epsilon_dpo/beta_margin_grad_std": 0.03261049836874008, "epsilon_dpo/beta_margin_mean": 0.08913961052894592, "epsilon_dpo/beta_margin_std": 0.1316053420305252, "epsilon_dpo/loss_margin_mean": 1.0002970695495605, "grad_norm": 64.61426544189453, "kl/avg_steps": 0.59375, "kl/beta": 0.09028997272253036, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.134328358208955e-07, "logits/chosen": -3.553682327270508, "logits/rejected": -3.5986783504486084, "logps/chosen": -64.7459487915039, "logps/ref_chosen": -64.23754119873047, "logps/ref_rejected": -70.3188705444336, "logps/rejected": -71.82757568359375, "loss": 1.3034, "rewards/accuracies": 0.765625, "rewards/chosen": -0.04597744718194008, "rewards/margins": 0.08913963288068771, "rewards/rejected": -0.1351170837879181, "step": 43 }, { "epoch": 0.06651549508692366, "epsilon_dpo/beta": 0.08940134197473526, "epsilon_dpo/beta_margin_grad_mean": -0.47723183035850525, "epsilon_dpo/beta_margin_grad_std": 0.03856487572193146, "epsilon_dpo/beta_margin_mean": 0.09183450043201447, "epsilon_dpo/beta_margin_std": 0.15607191622257233, "epsilon_dpo/loss_margin_mean": 1.0375595092773438, "grad_norm": 66.58892059326172, "kl/avg_steps": 0.40625, "kl/beta": 0.089757040143013, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.2089552238805965e-07, "logits/chosen": -3.653578758239746, "logits/rejected": -3.633225679397583, "logps/chosen": -62.823829650878906, "logps/ref_chosen": -62.204036712646484, "logps/ref_rejected": -80.45088195800781, "logps/rejected": -82.10823059082031, "loss": 1.3026, "rewards/accuracies": 0.734375, "rewards/chosen": -0.055801667273044586, "rewards/margins": 0.09183451533317566, "rewards/rejected": -0.14763619005680084, "step": 44 }, { "epoch": 0.06802721088435375, "epsilon_dpo/beta": 0.08873230218887329, "epsilon_dpo/beta_margin_grad_mean": -0.47271308302879333, "epsilon_dpo/beta_margin_grad_std": 0.032259467989206314, "epsilon_dpo/beta_margin_mean": 0.10980548709630966, "epsilon_dpo/beta_margin_std": 0.13043250143527985, "epsilon_dpo/loss_margin_mean": 1.2426363229751587, "grad_norm": 63.6839714050293, "kl/avg_steps": 0.75, "kl/beta": 0.08939387649297714, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.2835820895522385e-07, "logits/chosen": -3.5793776512145996, "logits/rejected": -3.5925559997558594, "logps/chosen": -53.979896545410156, "logps/ref_chosen": -53.40477752685547, "logps/ref_rejected": -85.97521209716797, "logps/rejected": -87.79296875, "loss": 1.2837, "rewards/accuracies": 0.90625, "rewards/chosen": -0.051206521689891815, "rewards/margins": 0.10980547964572906, "rewards/rejected": -0.16101199388504028, "step": 45 }, { "epoch": 0.06953892668178382, "epsilon_dpo/beta": 0.08837679028511047, "epsilon_dpo/beta_margin_grad_mean": -0.4748702943325043, "epsilon_dpo/beta_margin_grad_std": 0.042006317526102066, "epsilon_dpo/beta_margin_mean": 0.10130669921636581, "epsilon_dpo/beta_margin_std": 0.16962194442749023, "epsilon_dpo/loss_margin_mean": 1.1589288711547852, "grad_norm": 63.441226959228516, "kl/avg_steps": 0.40625, "kl/beta": 0.0887284129858017, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.3582089552238805e-07, "logits/chosen": -3.6010684967041016, "logits/rejected": -3.647663116455078, "logps/chosen": -53.51953887939453, "logps/ref_chosen": -52.967742919921875, "logps/ref_rejected": -73.82437133789062, "logps/rejected": -75.53508758544922, "loss": 1.2947, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0491541251540184, "rewards/margins": 0.10130667686462402, "rewards/rejected": -0.15046080946922302, "step": 46 }, { "epoch": 0.0710506424792139, "epsilon_dpo/beta": 0.08788112550973892, "epsilon_dpo/beta_margin_grad_mean": -0.4751608669757843, "epsilon_dpo/beta_margin_grad_std": 0.04114978387951851, "epsilon_dpo/beta_margin_mean": 0.10010926425457001, "epsilon_dpo/beta_margin_std": 0.1660405546426773, "epsilon_dpo/loss_margin_mean": 1.150001883506775, "grad_norm": 60.32307815551758, "kl/avg_steps": 0.5625, "kl/beta": 0.08836941421031952, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.432835820895522e-07, "logits/chosen": -3.561004400253296, "logits/rejected": -3.6481542587280273, "logps/chosen": -49.49091339111328, "logps/ref_chosen": -48.610904693603516, "logps/ref_rejected": -61.73335266113281, "logps/rejected": -63.76336669921875, "loss": 1.2955, "rewards/accuracies": 0.765625, "rewards/chosen": -0.07808675616979599, "rewards/margins": 0.10010924935340881, "rewards/rejected": -0.1781959980726242, "step": 47 }, { "epoch": 0.07256235827664399, "epsilon_dpo/beta": 0.08752687275409698, "epsilon_dpo/beta_margin_grad_mean": -0.4692089557647705, "epsilon_dpo/beta_margin_grad_std": 0.046542707830667496, "epsilon_dpo/beta_margin_mean": 0.12501280009746552, "epsilon_dpo/beta_margin_std": 0.19035613536834717, "epsilon_dpo/loss_margin_mean": 1.4402213096618652, "grad_norm": 54.42438888549805, "kl/avg_steps": 0.40625, "kl/beta": 0.0878751128911972, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.507462686567164e-07, "logits/chosen": -3.5957438945770264, "logits/rejected": -3.5760746002197266, "logps/chosen": -46.63817596435547, "logps/ref_chosen": -45.775848388671875, "logps/ref_rejected": -58.183353424072266, "logps/rejected": -60.485904693603516, "loss": 1.2741, "rewards/accuracies": 0.734375, "rewards/chosen": -0.0758018046617508, "rewards/margins": 0.1250128149986267, "rewards/rejected": -0.2008146047592163, "step": 48 }, { "epoch": 0.07407407407407407, "epsilon_dpo/beta": 0.08728214353322983, "epsilon_dpo/beta_margin_grad_mean": -0.47707274556159973, "epsilon_dpo/beta_margin_grad_std": 0.06627562642097473, "epsilon_dpo/beta_margin_mean": 0.093878373503685, "epsilon_dpo/beta_margin_std": 0.2709443271160126, "epsilon_dpo/loss_margin_mean": 1.0965173244476318, "grad_norm": 62.433040618896484, "kl/avg_steps": 0.28125, "kl/beta": 0.0875195637345314, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.5820895522388055e-07, "logits/chosen": -3.5887973308563232, "logits/rejected": -3.6188864707946777, "logps/chosen": -47.37166976928711, "logps/ref_chosen": -45.937103271484375, "logps/ref_rejected": -57.69104766845703, "logps/rejected": -60.222129821777344, "loss": 1.3127, "rewards/accuracies": 0.640625, "rewards/chosen": -0.12646175920963287, "rewards/margins": 0.0938783586025238, "rewards/rejected": -0.22034013271331787, "step": 49 }, { "epoch": 0.07558578987150416, "epsilon_dpo/beta": 0.08701007813215256, "epsilon_dpo/beta_margin_grad_mean": -0.4758742153644562, "epsilon_dpo/beta_margin_grad_std": 0.05359407886862755, "epsilon_dpo/beta_margin_mean": 0.09841331839561462, "epsilon_dpo/beta_margin_std": 0.2190161496400833, "epsilon_dpo/loss_margin_mean": 1.1475844383239746, "grad_norm": 60.549922943115234, "kl/avg_steps": 0.3125, "kl/beta": 0.08727411180734634, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.6567164179104475e-07, "logits/chosen": -3.562687873840332, "logits/rejected": -3.6017415523529053, "logps/chosen": -50.9469108581543, "logps/ref_chosen": -49.76499938964844, "logps/ref_rejected": -68.50381469726562, "logps/rejected": -70.83331298828125, "loss": 1.3021, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10354946553707123, "rewards/margins": 0.09841330349445343, "rewards/rejected": -0.20196276903152466, "step": 50 }, { "epoch": 0.07709750566893424, "epsilon_dpo/beta": 0.0866030603647232, "epsilon_dpo/beta_margin_grad_mean": -0.4704733192920685, "epsilon_dpo/beta_margin_grad_std": 0.06932269781827927, "epsilon_dpo/beta_margin_mean": 0.12144739180803299, "epsilon_dpo/beta_margin_std": 0.2858573794364929, "epsilon_dpo/loss_margin_mean": 1.4216961860656738, "grad_norm": 68.56024932861328, "kl/avg_steps": 0.46875, "kl/beta": 0.0870022252202034, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.7313432835820895e-07, "logits/chosen": -3.6086220741271973, "logits/rejected": -3.58148455619812, "logps/chosen": -55.835044860839844, "logps/ref_chosen": -53.99163818359375, "logps/ref_rejected": -80.39358520507812, "logps/rejected": -83.65869140625, "loss": 1.2886, "rewards/accuracies": 0.734375, "rewards/chosen": -0.16036289930343628, "rewards/margins": 0.12144741415977478, "rewards/rejected": -0.28181031346321106, "step": 51 }, { "epoch": 0.07860922146636433, "epsilon_dpo/beta": 0.08626652508974075, "epsilon_dpo/beta_margin_grad_mean": -0.46988561749458313, "epsilon_dpo/beta_margin_grad_std": 0.06996109336614609, "epsilon_dpo/beta_margin_mean": 0.12322840094566345, "epsilon_dpo/beta_margin_std": 0.28682681918144226, "epsilon_dpo/loss_margin_mean": 1.4506239891052246, "grad_norm": 66.95682525634766, "kl/avg_steps": 0.390625, "kl/beta": 0.0865963026881218, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.805970149253731e-07, "logits/chosen": -3.667311906814575, "logits/rejected": -3.704521417617798, "logps/chosen": -60.87031555175781, "logps/ref_chosen": -58.90207290649414, "logps/ref_rejected": -87.87213134765625, "logps/rejected": -91.29100036621094, "loss": 1.2871, "rewards/accuracies": 0.703125, "rewards/chosen": -0.17080238461494446, "rewards/margins": 0.12322834134101868, "rewards/rejected": -0.29403072595596313, "step": 52 }, { "epoch": 0.0801209372637944, "epsilon_dpo/beta": 0.08586360514163971, "epsilon_dpo/beta_margin_grad_mean": -0.4474100172519684, "epsilon_dpo/beta_margin_grad_std": 0.0785534605383873, "epsilon_dpo/beta_margin_mean": 0.21759696304798126, "epsilon_dpo/beta_margin_std": 0.3268547058105469, "epsilon_dpo/loss_margin_mean": 2.5575051307678223, "grad_norm": 55.70866394042969, "kl/avg_steps": 0.46875, "kl/beta": 0.08625935763120651, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.880597014925373e-07, "logits/chosen": -3.5891575813293457, "logits/rejected": -3.587313652038574, "logps/chosen": -52.06053924560547, "logps/ref_chosen": -50.257423400878906, "logps/ref_rejected": -52.29582214355469, "logps/rejected": -56.65644073486328, "loss": 1.2065, "rewards/accuracies": 0.734375, "rewards/chosen": -0.15547868609428406, "rewards/margins": 0.21759696304798126, "rewards/rejected": -0.3730756640434265, "step": 53 }, { "epoch": 0.08163265306122448, "epsilon_dpo/beta": 0.085570327937603, "epsilon_dpo/beta_margin_grad_mean": -0.4677498936653137, "epsilon_dpo/beta_margin_grad_std": 0.06854632496833801, "epsilon_dpo/beta_margin_mean": 0.13270346820354462, "epsilon_dpo/beta_margin_std": 0.2826445996761322, "epsilon_dpo/loss_margin_mean": 1.5725265741348267, "grad_norm": 55.031959533691406, "kl/avg_steps": 0.34375, "kl/beta": 0.08585689961910248, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.9552238805970144e-07, "logits/chosen": -3.530078411102295, "logits/rejected": -3.5772924423217773, "logps/chosen": -50.53800964355469, "logps/ref_chosen": -48.163211822509766, "logps/ref_rejected": -57.136348724365234, "logps/rejected": -61.083675384521484, "loss": 1.2776, "rewards/accuracies": 0.703125, "rewards/chosen": -0.203971266746521, "rewards/margins": 0.13270346820354462, "rewards/rejected": -0.3366747200489044, "step": 54 }, { "epoch": 0.08314436885865457, "epsilon_dpo/beta": 0.08519696444272995, "epsilon_dpo/beta_margin_grad_mean": -0.43981167674064636, "epsilon_dpo/beta_margin_grad_std": 0.09088561683893204, "epsilon_dpo/beta_margin_mean": 0.2508555054664612, "epsilon_dpo/beta_margin_std": 0.3826192319393158, "epsilon_dpo/loss_margin_mean": 2.974224090576172, "grad_norm": 60.19376754760742, "kl/avg_steps": 0.4375, "kl/beta": 0.08556278049945831, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.0298507462686564e-07, "logits/chosen": -3.5636472702026367, "logits/rejected": -3.6182875633239746, "logps/chosen": -44.739524841308594, "logps/ref_chosen": -42.096336364746094, "logps/ref_rejected": -82.34873962402344, "logps/rejected": -87.96615600585938, "loss": 1.1865, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22584018111228943, "rewards/margins": 0.25085556507110596, "rewards/rejected": -0.4766957759857178, "step": 55 }, { "epoch": 0.08465608465608465, "epsilon_dpo/beta": 0.08477260172367096, "epsilon_dpo/beta_margin_grad_mean": -0.43432483077049255, "epsilon_dpo/beta_margin_grad_std": 0.09119874984025955, "epsilon_dpo/beta_margin_mean": 0.27483227849006653, "epsilon_dpo/beta_margin_std": 0.3861686587333679, "epsilon_dpo/loss_margin_mean": 3.2685892581939697, "grad_norm": 57.67151641845703, "kl/avg_steps": 0.5, "kl/beta": 0.0851900726556778, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.1044776119402984e-07, "logits/chosen": -3.6413087844848633, "logits/rejected": -3.599949359893799, "logps/chosen": -54.51702880859375, "logps/ref_chosen": -53.04023742675781, "logps/ref_rejected": -61.41521453857422, "logps/rejected": -66.16059875488281, "loss": 1.1661, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12605857849121094, "rewards/margins": 0.27483224868774414, "rewards/rejected": -0.4008908271789551, "step": 56 }, { "epoch": 0.08616780045351474, "epsilon_dpo/beta": 0.08429785817861557, "epsilon_dpo/beta_margin_grad_mean": -0.4190262258052826, "epsilon_dpo/beta_margin_grad_std": 0.11077257990837097, "epsilon_dpo/beta_margin_mean": 0.3449619710445404, "epsilon_dpo/beta_margin_std": 0.477033406496048, "epsilon_dpo/loss_margin_mean": 4.123124122619629, "grad_norm": 60.82868576049805, "kl/avg_steps": 0.5625, "kl/beta": 0.08476623892784119, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.17910447761194e-07, "logits/chosen": -3.6695680618286133, "logits/rejected": -3.640049934387207, "logps/chosen": -57.3392448425293, "logps/ref_chosen": -55.135032653808594, "logps/ref_rejected": -69.75180053710938, "logps/rejected": -76.07914733886719, "loss": 1.1249, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18684326112270355, "rewards/margins": 0.3449619710445404, "rewards/rejected": -0.5318052172660828, "step": 57 }, { "epoch": 0.08767951625094482, "epsilon_dpo/beta": 0.08390536904335022, "epsilon_dpo/beta_margin_grad_mean": -0.4341975152492523, "epsilon_dpo/beta_margin_grad_std": 0.10622064024209976, "epsilon_dpo/beta_margin_mean": 0.275643914937973, "epsilon_dpo/beta_margin_std": 0.4503594636917114, "epsilon_dpo/loss_margin_mean": 3.3191847801208496, "grad_norm": 66.65570068359375, "kl/avg_steps": 0.46875, "kl/beta": 0.08429209887981415, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.253731343283582e-07, "logits/chosen": -3.709059476852417, "logits/rejected": -3.6754136085510254, "logps/chosen": -71.13023376464844, "logps/ref_chosen": -67.07564544677734, "logps/ref_rejected": -79.59711456298828, "logps/rejected": -86.97088623046875, "loss": 1.1784, "rewards/accuracies": 0.765625, "rewards/chosen": -0.34206002950668335, "rewards/margins": 0.27564388513565063, "rewards/rejected": -0.617703914642334, "step": 58 }, { "epoch": 0.08919123204837491, "epsilon_dpo/beta": 0.08343522995710373, "epsilon_dpo/beta_margin_grad_mean": -0.4273790717124939, "epsilon_dpo/beta_margin_grad_std": 0.10428285598754883, "epsilon_dpo/beta_margin_mean": 0.3099270164966583, "epsilon_dpo/beta_margin_std": 0.4575274586677551, "epsilon_dpo/loss_margin_mean": 3.7425787448883057, "grad_norm": 62.80437469482422, "kl/avg_steps": 0.5625, "kl/beta": 0.08389881998300552, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.3283582089552234e-07, "logits/chosen": -3.5744237899780273, "logits/rejected": -3.629863739013672, "logps/chosen": -49.41357421875, "logps/ref_chosen": -46.237396240234375, "logps/ref_rejected": -71.51837158203125, "logps/rejected": -78.43712615966797, "loss": 1.1494, "rewards/accuracies": 0.78125, "rewards/chosen": -0.26609304547309875, "rewards/margins": 0.3099270164966583, "rewards/rejected": -0.5760200619697571, "step": 59 }, { "epoch": 0.09070294784580499, "epsilon_dpo/beta": 0.08312498033046722, "epsilon_dpo/beta_margin_grad_mean": -0.4324890673160553, "epsilon_dpo/beta_margin_grad_std": 0.12437763810157776, "epsilon_dpo/beta_margin_mean": 0.2913932800292969, "epsilon_dpo/beta_margin_std": 0.5456759929656982, "epsilon_dpo/loss_margin_mean": 3.545145034790039, "grad_norm": 65.50759887695312, "kl/avg_steps": 0.375, "kl/beta": 0.08342953026294708, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.4029850746268654e-07, "logits/chosen": -3.594968795776367, "logits/rejected": -3.5757360458374023, "logps/chosen": -54.57878112792969, "logps/ref_chosen": -51.502197265625, "logps/ref_rejected": -56.842464447021484, "logps/rejected": -63.464195251464844, "loss": 1.1862, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2572042644023895, "rewards/margins": 0.2913932800292969, "rewards/rejected": -0.5485975742340088, "step": 60 }, { "epoch": 0.09221466364323508, "epsilon_dpo/beta": 0.08276247978210449, "epsilon_dpo/beta_margin_grad_mean": -0.4284430742263794, "epsilon_dpo/beta_margin_grad_std": 0.1417059749364853, "epsilon_dpo/beta_margin_mean": 0.3133644461631775, "epsilon_dpo/beta_margin_std": 0.6312664747238159, "epsilon_dpo/loss_margin_mean": 3.8356871604919434, "grad_norm": 68.0422592163086, "kl/avg_steps": 0.4375, "kl/beta": 0.08311784267425537, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.4776119402985074e-07, "logits/chosen": -3.6363086700439453, "logits/rejected": -3.625993251800537, "logps/chosen": -47.573150634765625, "logps/ref_chosen": -43.841529846191406, "logps/ref_rejected": -69.73106384277344, "logps/rejected": -77.29837036132812, "loss": 1.1904, "rewards/accuracies": 0.703125, "rewards/chosen": -0.31046566367149353, "rewards/margins": 0.3133644461631775, "rewards/rejected": -0.6238300800323486, "step": 61 }, { "epoch": 0.09372637944066516, "epsilon_dpo/beta": 0.08222091943025589, "epsilon_dpo/beta_margin_grad_mean": -0.38994866609573364, "epsilon_dpo/beta_margin_grad_std": 0.1302434802055359, "epsilon_dpo/beta_margin_mean": 0.49356260895729065, "epsilon_dpo/beta_margin_std": 0.6134549975395203, "epsilon_dpo/loss_margin_mean": 6.03684139251709, "grad_norm": 64.02687072753906, "kl/avg_steps": 0.65625, "kl/beta": 0.08275578171014786, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.552238805970149e-07, "logits/chosen": -3.657285690307617, "logits/rejected": -3.638922691345215, "logps/chosen": -53.906002044677734, "logps/ref_chosen": -49.25220489501953, "logps/ref_rejected": -67.45822143554688, "logps/rejected": -78.14886474609375, "loss": 1.0362, "rewards/accuracies": 0.84375, "rewards/chosen": -0.38426846265792847, "rewards/margins": 0.49356257915496826, "rewards/rejected": -0.8778310418128967, "step": 62 }, { "epoch": 0.09523809523809523, "epsilon_dpo/beta": 0.0817105621099472, "epsilon_dpo/beta_margin_grad_mean": -0.37466397881507874, "epsilon_dpo/beta_margin_grad_std": 0.14748454093933105, "epsilon_dpo/beta_margin_mean": 0.5534582734107971, "epsilon_dpo/beta_margin_std": 0.6765353679656982, "epsilon_dpo/loss_margin_mean": 6.818574905395508, "grad_norm": 69.56266784667969, "kl/avg_steps": 0.625, "kl/beta": 0.08221624046564102, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.626865671641791e-07, "logits/chosen": -3.5730316638946533, "logits/rejected": -3.5252084732055664, "logps/chosen": -61.742088317871094, "logps/ref_chosen": -57.321510314941406, "logps/ref_rejected": -81.04618835449219, "logps/rejected": -92.28535461425781, "loss": 1.0121, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3626123368740082, "rewards/margins": 0.5534582138061523, "rewards/rejected": -0.9160705208778381, "step": 63 }, { "epoch": 0.09674981103552532, "epsilon_dpo/beta": 0.08143285661935806, "epsilon_dpo/beta_margin_grad_mean": -0.44668301939964294, "epsilon_dpo/beta_margin_grad_std": 0.13906729221343994, "epsilon_dpo/beta_margin_mean": 0.23866620659828186, "epsilon_dpo/beta_margin_std": 0.6315385699272156, "epsilon_dpo/loss_margin_mean": 2.978726863861084, "grad_norm": 73.9322280883789, "kl/avg_steps": 0.34375, "kl/beta": 0.08170557767152786, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.701492537313433e-07, "logits/chosen": -3.6107091903686523, "logits/rejected": -3.6445960998535156, "logps/chosen": -54.53688430786133, "logps/ref_chosen": -48.90271759033203, "logps/ref_rejected": -64.57941436767578, "logps/rejected": -73.19230651855469, "loss": 1.2542, "rewards/accuracies": 0.703125, "rewards/chosen": -0.46170222759246826, "rewards/margins": 0.23866620659828186, "rewards/rejected": -0.7003684043884277, "step": 64 }, { "epoch": 0.0982615268329554, "epsilon_dpo/beta": 0.08110299706459045, "epsilon_dpo/beta_margin_grad_mean": -0.3978002369403839, "epsilon_dpo/beta_margin_grad_std": 0.17008855938911438, "epsilon_dpo/beta_margin_mean": 0.4663151502609253, "epsilon_dpo/beta_margin_std": 0.8097885251045227, "epsilon_dpo/loss_margin_mean": 5.818244934082031, "grad_norm": 73.4081802368164, "kl/avg_steps": 0.40625, "kl/beta": 0.08142568171024323, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.776119402985074e-07, "logits/chosen": -3.6486706733703613, "logits/rejected": -3.630551815032959, "logps/chosen": -70.02645874023438, "logps/ref_chosen": -63.059471130371094, "logps/ref_rejected": -72.15412902832031, "logps/rejected": -84.93936157226562, "loss": 1.1197, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5676844716072083, "rewards/margins": 0.4663151502609253, "rewards/rejected": -1.0339996814727783, "step": 65 }, { "epoch": 0.09977324263038549, "epsilon_dpo/beta": 0.08064812421798706, "epsilon_dpo/beta_margin_grad_mean": -0.39151403307914734, "epsilon_dpo/beta_margin_grad_std": 0.16932065784931183, "epsilon_dpo/beta_margin_mean": 0.5038672685623169, "epsilon_dpo/beta_margin_std": 0.8001295924186707, "epsilon_dpo/loss_margin_mean": 6.3037800788879395, "grad_norm": 75.10379791259766, "kl/avg_steps": 0.5625, "kl/beta": 0.08109622448682785, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.850746268656717e-07, "logits/chosen": -3.7132389545440674, "logits/rejected": -3.696784496307373, "logps/chosen": -70.03843688964844, "logps/ref_chosen": -63.544776916503906, "logps/ref_rejected": -69.46114349365234, "logps/rejected": -82.25859069824219, "loss": 1.0873, "rewards/accuracies": 0.78125, "rewards/chosen": -0.525876522064209, "rewards/margins": 0.5038673281669617, "rewards/rejected": -1.0297439098358154, "step": 66 }, { "epoch": 0.10128495842781557, "epsilon_dpo/beta": 0.08032303303480148, "epsilon_dpo/beta_margin_grad_mean": -0.4044095575809479, "epsilon_dpo/beta_margin_grad_std": 0.18034374713897705, "epsilon_dpo/beta_margin_mean": 0.46292153000831604, "epsilon_dpo/beta_margin_std": 0.880039632320404, "epsilon_dpo/loss_margin_mean": 5.831025123596191, "grad_norm": 72.0900650024414, "kl/avg_steps": 0.40625, "kl/beta": 0.08064261078834534, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.925373134328357e-07, "logits/chosen": -3.629335403442383, "logits/rejected": -3.612574577331543, "logps/chosen": -61.8409538269043, "logps/ref_chosen": -55.75690460205078, "logps/ref_rejected": -63.51603698730469, "logps/rejected": -75.43110656738281, "loss": 1.1457, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4915390610694885, "rewards/margins": 0.4629215598106384, "rewards/rejected": -0.954460620880127, "step": 67 }, { "epoch": 0.10279667422524566, "epsilon_dpo/beta": 0.0800231471657753, "epsilon_dpo/beta_margin_grad_mean": -0.39887458086013794, "epsilon_dpo/beta_margin_grad_std": 0.17603877186775208, "epsilon_dpo/beta_margin_mean": 0.495108962059021, "epsilon_dpo/beta_margin_std": 0.8734935522079468, "epsilon_dpo/loss_margin_mean": 6.256532669067383, "grad_norm": 74.73369598388672, "kl/avg_steps": 0.375, "kl/beta": 0.08031632751226425, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 5e-07, "logits/chosen": -3.6974828243255615, "logits/rejected": -3.7152044773101807, "logps/chosen": -74.85816955566406, "logps/ref_chosen": -65.72211456298828, "logps/ref_rejected": -82.25398254394531, "logps/rejected": -97.64656829833984, "loss": 1.1145, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7328962087631226, "rewards/margins": 0.495108962059021, "rewards/rejected": -1.2280051708221436, "step": 68 }, { "epoch": 0.10430839002267574, "epsilon_dpo/beta": 0.07962413877248764, "epsilon_dpo/beta_margin_grad_mean": -0.3918708860874176, "epsilon_dpo/beta_margin_grad_std": 0.18983057141304016, "epsilon_dpo/beta_margin_mean": 0.5220910310745239, "epsilon_dpo/beta_margin_std": 0.9239572882652283, "epsilon_dpo/loss_margin_mean": 6.628524303436279, "grad_norm": 84.05654907226562, "kl/avg_steps": 0.5, "kl/beta": 0.08001626282930374, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.999965034812934e-07, "logits/chosen": -3.6352009773254395, "logits/rejected": -3.6255834102630615, "logps/chosen": -63.700294494628906, "logps/ref_chosen": -54.92646789550781, "logps/ref_rejected": -82.88018798828125, "logps/rejected": -98.28253173828125, "loss": 1.1175, "rewards/accuracies": 0.75, "rewards/chosen": -0.7021700143814087, "rewards/margins": 0.5220910906791687, "rewards/rejected": -1.2242610454559326, "step": 69 }, { "epoch": 0.10582010582010581, "epsilon_dpo/beta": 0.07930265367031097, "epsilon_dpo/beta_margin_grad_mean": -0.38648274540901184, "epsilon_dpo/beta_margin_grad_std": 0.19681678712368011, "epsilon_dpo/beta_margin_mean": 0.5365067720413208, "epsilon_dpo/beta_margin_std": 0.9472194910049438, "epsilon_dpo/loss_margin_mean": 6.849758625030518, "grad_norm": 77.7649917602539, "kl/avg_steps": 0.40625, "kl/beta": 0.0796181708574295, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.999860140229787e-07, "logits/chosen": -3.6085243225097656, "logits/rejected": -3.598844051361084, "logps/chosen": -62.70173645019531, "logps/ref_chosen": -55.54304504394531, "logps/ref_rejected": -72.46739959716797, "logps/rejected": -86.47584533691406, "loss": 1.1185, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5718520283699036, "rewards/margins": 0.5365067720413208, "rewards/rejected": -1.1083588600158691, "step": 70 }, { "epoch": 0.1073318216175359, "epsilon_dpo/beta": 0.07913047820329666, "epsilon_dpo/beta_margin_grad_mean": -0.4042842984199524, "epsilon_dpo/beta_margin_grad_std": 0.18438421189785004, "epsilon_dpo/beta_margin_mean": 0.46302467584609985, "epsilon_dpo/beta_margin_std": 0.9153339266777039, "epsilon_dpo/loss_margin_mean": 5.9294328689575195, "grad_norm": 77.0405044555664, "kl/avg_steps": 0.21875, "kl/beta": 0.07929603010416031, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.999685319184688e-07, "logits/chosen": -3.588857889175415, "logits/rejected": -3.5640816688537598, "logps/chosen": -63.266414642333984, "logps/ref_chosen": -54.22770309448242, "logps/ref_rejected": -59.25541687011719, "logps/rejected": -74.22355651855469, "loss": 1.158, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7186065912246704, "rewards/margins": 0.46302467584609985, "rewards/rejected": -1.181631326675415, "step": 71 }, { "epoch": 0.10884353741496598, "epsilon_dpo/beta": 0.07873521000146866, "epsilon_dpo/beta_margin_grad_mean": -0.3645576536655426, "epsilon_dpo/beta_margin_grad_std": 0.21695895493030548, "epsilon_dpo/beta_margin_mean": 0.7148928642272949, "epsilon_dpo/beta_margin_std": 1.1266639232635498, "epsilon_dpo/loss_margin_mean": 9.170205116271973, "grad_norm": 78.14938354492188, "kl/avg_steps": 0.5, "kl/beta": 0.07912295311689377, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.999440576567755e-07, "logits/chosen": -3.514237403869629, "logits/rejected": -3.585242509841919, "logps/chosen": -56.696189880371094, "logps/ref_chosen": -48.87178421020508, "logps/ref_rejected": -62.1739387512207, "logps/rejected": -79.16854858398438, "loss": 1.0547, "rewards/accuracies": 0.765625, "rewards/chosen": -0.619001030921936, "rewards/margins": 0.7148928642272949, "rewards/rejected": -1.3338937759399414, "step": 72 }, { "epoch": 0.11035525321239607, "epsilon_dpo/beta": 0.0785895437002182, "epsilon_dpo/beta_margin_grad_mean": -0.43877267837524414, "epsilon_dpo/beta_margin_grad_std": 0.2261706441640854, "epsilon_dpo/beta_margin_mean": 0.3175167739391327, "epsilon_dpo/beta_margin_std": 1.1321009397506714, "epsilon_dpo/loss_margin_mean": 4.150314807891846, "grad_norm": 95.90277099609375, "kl/avg_steps": 0.1875, "kl/beta": 0.0787293016910553, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.999125919224965e-07, "logits/chosen": -3.632051467895508, "logits/rejected": -3.6478147506713867, "logps/chosen": -71.2369384765625, "logps/ref_chosen": -59.030941009521484, "logps/ref_rejected": -76.38026428222656, "logps/rejected": -92.736572265625, "loss": 1.3737, "rewards/accuracies": 0.625, "rewards/chosen": -0.964561939239502, "rewards/margins": 0.3175167441368103, "rewards/rejected": -1.282078742980957, "step": 73 }, { "epoch": 0.11186696900982615, "epsilon_dpo/beta": 0.07815991342067719, "epsilon_dpo/beta_margin_grad_mean": -0.3408944010734558, "epsilon_dpo/beta_margin_grad_std": 0.217587411403656, "epsilon_dpo/beta_margin_mean": 0.8345863819122314, "epsilon_dpo/beta_margin_std": 1.1703593730926514, "epsilon_dpo/loss_margin_mean": 10.764760971069336, "grad_norm": 65.18840789794922, "kl/avg_steps": 0.546875, "kl/beta": 0.07858196645975113, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.998741355957963e-07, "logits/chosen": -3.5298900604248047, "logits/rejected": -3.503810405731201, "logps/chosen": -51.178375244140625, "logps/ref_chosen": -43.23417663574219, "logps/ref_rejected": -73.47119140625, "logps/rejected": -92.18014526367188, "loss": 0.9934, "rewards/accuracies": 0.765625, "rewards/chosen": -0.624750018119812, "rewards/margins": 0.8345862627029419, "rewards/rejected": -1.459336280822754, "step": 74 }, { "epoch": 0.11337868480725624, "epsilon_dpo/beta": 0.07786925882101059, "epsilon_dpo/beta_margin_grad_mean": -0.3808808922767639, "epsilon_dpo/beta_margin_grad_std": 0.219953253865242, "epsilon_dpo/beta_margin_mean": 0.644891619682312, "epsilon_dpo/beta_margin_std": 1.2404156923294067, "epsilon_dpo/loss_margin_mean": 8.382410049438477, "grad_norm": 73.35260009765625, "kl/avg_steps": 0.375, "kl/beta": 0.078154556453228, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.998286897523808e-07, "logits/chosen": -3.529576063156128, "logits/rejected": -3.6022372245788574, "logps/chosen": -47.48907470703125, "logps/ref_chosen": -39.339385986328125, "logps/ref_rejected": -59.23255920410156, "logps/rejected": -75.76466369628906, "loss": 1.1421, "rewards/accuracies": 0.71875, "rewards/chosen": -0.638184666633606, "rewards/margins": 0.644891619682312, "rewards/rejected": -1.283076286315918, "step": 75 }, { "epoch": 0.11489040060468632, "epsilon_dpo/beta": 0.07743233442306519, "epsilon_dpo/beta_margin_grad_mean": -0.3443877398967743, "epsilon_dpo/beta_margin_grad_std": 0.20156516134738922, "epsilon_dpo/beta_margin_mean": 0.8102350831031799, "epsilon_dpo/beta_margin_std": 1.090848445892334, "epsilon_dpo/loss_margin_mean": 10.537938117980957, "grad_norm": 60.99016571044922, "kl/avg_steps": 0.5625, "kl/beta": 0.07786256819963455, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.997762556634679e-07, "logits/chosen": -3.557114839553833, "logits/rejected": -3.5581860542297363, "logps/chosen": -51.26044464111328, "logps/ref_chosen": -46.23114013671875, "logps/ref_rejected": -69.7197036743164, "logps/rejected": -85.28694152832031, "loss": 0.969, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39314568042755127, "rewards/margins": 0.8102351427078247, "rewards/rejected": -1.203380823135376, "step": 76 }, { "epoch": 0.1164021164021164, "epsilon_dpo/beta": 0.07709600776433945, "epsilon_dpo/beta_margin_grad_mean": -0.34725263714790344, "epsilon_dpo/beta_margin_grad_std": 0.21298830211162567, "epsilon_dpo/beta_margin_mean": 0.7925384640693665, "epsilon_dpo/beta_margin_std": 1.128037929534912, "epsilon_dpo/loss_margin_mean": 10.376481056213379, "grad_norm": 71.08395385742188, "kl/avg_steps": 0.4375, "kl/beta": 0.07742704451084137, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.99716834795752e-07, "logits/chosen": -3.6186790466308594, "logits/rejected": -3.626835823059082, "logps/chosen": -54.421024322509766, "logps/ref_chosen": -44.967525482177734, "logps/ref_rejected": -72.94367980957031, "logps/rejected": -92.77365112304688, "loss": 1.0024, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7322876453399658, "rewards/margins": 0.7925384044647217, "rewards/rejected": -1.5248260498046875, "step": 77 }, { "epoch": 0.11791383219954649, "epsilon_dpo/beta": 0.0767601877450943, "epsilon_dpo/beta_margin_grad_mean": -0.4033251702785492, "epsilon_dpo/beta_margin_grad_std": 0.21421436965465546, "epsilon_dpo/beta_margin_mean": 0.4989599585533142, "epsilon_dpo/beta_margin_std": 1.0872387886047363, "epsilon_dpo/loss_margin_mean": 6.592899799346924, "grad_norm": 88.50785827636719, "kl/avg_steps": 0.4375, "kl/beta": 0.07708977907896042, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.996504288113623e-07, "logits/chosen": -3.6268763542175293, "logits/rejected": -3.5964431762695312, "logps/chosen": -71.16470336914062, "logps/ref_chosen": -62.18614196777344, "logps/ref_rejected": -72.76731872558594, "logps/rejected": -88.33878326416016, "loss": 1.2005, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6939281821250916, "rewards/margins": 0.498960018157959, "rewards/rejected": -1.1928881406784058, "step": 78 }, { "epoch": 0.11942554799697656, "epsilon_dpo/beta": 0.07623392343521118, "epsilon_dpo/beta_margin_grad_mean": -0.3381175100803375, "epsilon_dpo/beta_margin_grad_std": 0.2067301720380783, "epsilon_dpo/beta_margin_mean": 0.8460524678230286, "epsilon_dpo/beta_margin_std": 1.192158818244934, "epsilon_dpo/loss_margin_mean": 11.167950630187988, "grad_norm": 72.36978912353516, "kl/avg_steps": 0.6875, "kl/beta": 0.07675398141145706, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.995770395678171e-07, "logits/chosen": -3.5440924167633057, "logits/rejected": -3.667853355407715, "logps/chosen": -52.910247802734375, "logps/ref_chosen": -43.70287322998047, "logps/ref_rejected": -79.67294311523438, "logps/rejected": -100.04827117919922, "loss": 0.9805, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7051568031311035, "rewards/margins": 0.8460524678230286, "rewards/rejected": -1.5512093305587769, "step": 79 }, { "epoch": 0.12093726379440665, "epsilon_dpo/beta": 0.07595163583755493, "epsilon_dpo/beta_margin_grad_mean": -0.3493572771549225, "epsilon_dpo/beta_margin_grad_std": 0.21282392740249634, "epsilon_dpo/beta_margin_mean": 0.7918369770050049, "epsilon_dpo/beta_margin_std": 1.124416470527649, "epsilon_dpo/loss_margin_mean": 10.530081748962402, "grad_norm": 79.38997650146484, "kl/avg_steps": 0.375, "kl/beta": 0.07622990012168884, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.994966691179711e-07, "logits/chosen": -3.6433496475219727, "logits/rejected": -3.6324872970581055, "logps/chosen": -61.097984313964844, "logps/ref_chosen": -50.88941955566406, "logps/ref_rejected": -67.38335418701172, "logps/rejected": -88.12200164794922, "loss": 0.9991, "rewards/accuracies": 0.71875, "rewards/chosen": -0.778607964515686, "rewards/margins": 0.7918369770050049, "rewards/rejected": -1.5704448223114014, "step": 80 }, { "epoch": 0.12244897959183673, "epsilon_dpo/beta": 0.07552546262741089, "epsilon_dpo/beta_margin_grad_mean": -0.3401520550251007, "epsilon_dpo/beta_margin_grad_std": 0.2042655497789383, "epsilon_dpo/beta_margin_mean": 0.8663771748542786, "epsilon_dpo/beta_margin_std": 1.1342484951019287, "epsilon_dpo/loss_margin_mean": 11.549817085266113, "grad_norm": 66.8261947631836, "kl/avg_steps": 0.5625, "kl/beta": 0.07594510167837143, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.994093197099587e-07, "logits/chosen": -3.6094839572906494, "logits/rejected": -3.6188840866088867, "logps/chosen": -62.024810791015625, "logps/ref_chosen": -52.98720169067383, "logps/ref_rejected": -71.15193176269531, "logps/rejected": -91.73936462402344, "loss": 0.9456, "rewards/accuracies": 0.765625, "rewards/chosen": -0.685111403465271, "rewards/margins": 0.8663771152496338, "rewards/rejected": -1.5514886379241943, "step": 81 }, { "epoch": 0.12396069538926682, "epsilon_dpo/beta": 0.07503219693899155, "epsilon_dpo/beta_margin_grad_mean": -0.3019990622997284, "epsilon_dpo/beta_margin_grad_std": 0.19777894020080566, "epsilon_dpo/beta_margin_mean": 1.0670703649520874, "epsilon_dpo/beta_margin_std": 1.1176426410675049, "epsilon_dpo/loss_margin_mean": 14.290183067321777, "grad_norm": 71.0761489868164, "kl/avg_steps": 0.65625, "kl/beta": 0.07552029937505722, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.993149937871306e-07, "logits/chosen": -3.507404088973999, "logits/rejected": -3.520355224609375, "logps/chosen": -45.10761260986328, "logps/ref_chosen": -40.267547607421875, "logps/ref_rejected": -66.41444396972656, "logps/rejected": -85.544677734375, "loss": 0.817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3659554719924927, "rewards/margins": 1.0670702457427979, "rewards/rejected": -1.43302583694458, "step": 82 }, { "epoch": 0.1254724111866969, "epsilon_dpo/beta": 0.07456645369529724, "epsilon_dpo/beta_margin_grad_mean": -0.33078646659851074, "epsilon_dpo/beta_margin_grad_std": 0.2081425040960312, "epsilon_dpo/beta_margin_mean": 0.8852189779281616, "epsilon_dpo/beta_margin_std": 1.139277696609497, "epsilon_dpo/loss_margin_mean": 11.948138236999512, "grad_norm": 87.97261047363281, "kl/avg_steps": 0.625, "kl/beta": 0.07502792775630951, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.992136939879856e-07, "logits/chosen": -3.541891574859619, "logits/rejected": -3.610997200012207, "logps/chosen": -53.40816879272461, "logps/ref_chosen": -46.25514221191406, "logps/ref_rejected": -64.25912475585938, "logps/rejected": -83.36029052734375, "loss": 0.9427, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5360057353973389, "rewards/margins": 0.8852189779281616, "rewards/rejected": -1.4212247133255005, "step": 83 }, { "epoch": 0.12698412698412698, "epsilon_dpo/beta": 0.07419653236865997, "epsilon_dpo/beta_margin_grad_mean": -0.3491345942020416, "epsilon_dpo/beta_margin_grad_std": 0.19632074236869812, "epsilon_dpo/beta_margin_mean": 0.7667987942695618, "epsilon_dpo/beta_margin_std": 1.05858314037323, "epsilon_dpo/loss_margin_mean": 10.419161796569824, "grad_norm": 73.54373931884766, "kl/avg_steps": 0.5, "kl/beta": 0.07456191629171371, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.991054231460969e-07, "logits/chosen": -3.5714755058288574, "logits/rejected": -3.613593816757202, "logps/chosen": -66.04085540771484, "logps/ref_chosen": -54.9249267578125, "logps/ref_rejected": -76.3885498046875, "logps/rejected": -97.92364501953125, "loss": 0.9854, "rewards/accuracies": 0.78125, "rewards/chosen": -0.82747483253479, "rewards/margins": 0.7667988538742065, "rewards/rejected": -1.5942736864089966, "step": 84 }, { "epoch": 0.12849584278155707, "epsilon_dpo/beta": 0.0737578347325325, "epsilon_dpo/beta_margin_grad_mean": -0.33331748843193054, "epsilon_dpo/beta_margin_grad_std": 0.20745497941970825, "epsilon_dpo/beta_margin_mean": 0.8824166655540466, "epsilon_dpo/beta_margin_std": 1.1137104034423828, "epsilon_dpo/loss_margin_mean": 12.0418119430542, "grad_norm": 73.45349884033203, "kl/avg_steps": 0.59375, "kl/beta": 0.07419096678495407, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.989901842900325e-07, "logits/chosen": -3.552335262298584, "logits/rejected": -3.5140767097473145, "logps/chosen": -55.24536895751953, "logps/ref_chosen": -47.86343765258789, "logps/ref_rejected": -58.64827346801758, "logps/rejected": -78.072021484375, "loss": 0.9343, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5470678806304932, "rewards/margins": 0.8824167251586914, "rewards/rejected": -1.4294846057891846, "step": 85 }, { "epoch": 0.13000755857898716, "epsilon_dpo/beta": 0.07348383218050003, "epsilon_dpo/beta_margin_grad_mean": -0.3725527822971344, "epsilon_dpo/beta_margin_grad_std": 0.2182748019695282, "epsilon_dpo/beta_margin_mean": 0.6760433316230774, "epsilon_dpo/beta_margin_std": 1.143048882484436, "epsilon_dpo/loss_margin_mean": 9.304327011108398, "grad_norm": 81.55567169189453, "kl/avg_steps": 0.375, "kl/beta": 0.07375305891036987, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.988679806432711e-07, "logits/chosen": -3.6162681579589844, "logits/rejected": -3.6235525608062744, "logps/chosen": -63.926788330078125, "logps/ref_chosen": -53.91974639892578, "logps/ref_rejected": -66.42182159423828, "logps/rejected": -85.73320007324219, "loss": 1.088, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7390810251235962, "rewards/margins": 0.6760433912277222, "rewards/rejected": -1.4151244163513184, "step": 86 }, { "epoch": 0.13151927437641722, "epsilon_dpo/beta": 0.0731174424290657, "epsilon_dpo/beta_margin_grad_mean": -0.329822838306427, "epsilon_dpo/beta_margin_grad_std": 0.23226206004619598, "epsilon_dpo/beta_margin_mean": 0.9244822859764099, "epsilon_dpo/beta_margin_std": 1.3805556297302246, "epsilon_dpo/loss_margin_mean": 12.757396697998047, "grad_norm": 72.45609283447266, "kl/avg_steps": 0.5, "kl/beta": 0.07347751408815384, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.987388156241114e-07, "logits/chosen": -3.572812080383301, "logits/rejected": -3.6262753009796143, "logps/chosen": -63.66492462158203, "logps/ref_chosen": -56.29553985595703, "logps/ref_rejected": -75.63311767578125, "logps/rejected": -95.75990295410156, "loss": 1.0183, "rewards/accuracies": 0.75, "rewards/chosen": -0.5435619950294495, "rewards/margins": 0.9244823455810547, "rewards/rejected": -1.4680442810058594, "step": 87 }, { "epoch": 0.1330309901738473, "epsilon_dpo/beta": 0.07270796597003937, "epsilon_dpo/beta_margin_grad_mean": -0.3360753357410431, "epsilon_dpo/beta_margin_grad_std": 0.21390895545482635, "epsilon_dpo/beta_margin_mean": 0.8687413930892944, "epsilon_dpo/beta_margin_std": 1.2205601930618286, "epsilon_dpo/loss_margin_mean": 12.032463073730469, "grad_norm": 66.3083724975586, "kl/avg_steps": 0.5625, "kl/beta": 0.07311195135116577, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.986026928455767e-07, "logits/chosen": -3.5915050506591797, "logits/rejected": -3.5167384147644043, "logps/chosen": -64.58416748046875, "logps/ref_chosen": -58.008209228515625, "logps/ref_rejected": -67.19764709472656, "logps/rejected": -85.80606079101562, "loss": 0.9848, "rewards/accuracies": 0.8125, "rewards/chosen": -0.479391872882843, "rewards/margins": 0.8687413930892944, "rewards/rejected": -1.3481333255767822, "step": 88 }, { "epoch": 0.1345427059712774, "epsilon_dpo/beta": 0.07246033102273941, "epsilon_dpo/beta_margin_grad_mean": -0.3966783285140991, "epsilon_dpo/beta_margin_grad_std": 0.21072609722614288, "epsilon_dpo/beta_margin_mean": 0.5499569177627563, "epsilon_dpo/beta_margin_std": 1.1226202249526978, "epsilon_dpo/loss_margin_mean": 7.691743850708008, "grad_norm": 83.16149139404297, "kl/avg_steps": 0.34375, "kl/beta": 0.07270300388336182, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.984596161153135e-07, "logits/chosen": -3.5263407230377197, "logits/rejected": -3.604508638381958, "logps/chosen": -46.194881439208984, "logps/ref_chosen": -39.618568420410156, "logps/ref_rejected": -75.35650634765625, "logps/rejected": -89.62456512451172, "loss": 1.1693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47995725274086, "rewards/margins": 0.5499569177627563, "rewards/rejected": -1.029914140701294, "step": 89 }, { "epoch": 0.1360544217687075, "epsilon_dpo/beta": 0.07205358892679214, "epsilon_dpo/beta_margin_grad_mean": -0.345061331987381, "epsilon_dpo/beta_margin_grad_std": 0.21293526887893677, "epsilon_dpo/beta_margin_mean": 0.808558464050293, "epsilon_dpo/beta_margin_std": 1.1790647506713867, "epsilon_dpo/loss_margin_mean": 11.309035301208496, "grad_norm": 88.09986877441406, "kl/avg_steps": 0.5625, "kl/beta": 0.07245393842458725, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.983095894354857e-07, "logits/chosen": -3.561004161834717, "logits/rejected": -3.554290533065796, "logps/chosen": -62.14801788330078, "logps/ref_chosen": -54.349002838134766, "logps/ref_rejected": -79.10935974121094, "logps/rejected": -98.2174072265625, "loss": 1.0085, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5635464191436768, "rewards/margins": 0.808558464050293, "rewards/rejected": -1.3721048831939697, "step": 90 }, { "epoch": 0.13756613756613756, "epsilon_dpo/beta": 0.07176313549280167, "epsilon_dpo/beta_margin_grad_mean": -0.3689362704753876, "epsilon_dpo/beta_margin_grad_std": 0.2214219570159912, "epsilon_dpo/beta_margin_mean": 0.7057680487632751, "epsilon_dpo/beta_margin_std": 1.202399730682373, "epsilon_dpo/loss_margin_mean": 9.943520545959473, "grad_norm": 69.88008880615234, "kl/avg_steps": 0.40625, "kl/beta": 0.07204866409301758, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.98152617002662e-07, "logits/chosen": -3.5581440925598145, "logits/rejected": -3.594620704650879, "logps/chosen": -54.25669479370117, "logps/ref_chosen": -46.5614013671875, "logps/ref_rejected": -63.25788879394531, "logps/rejected": -80.89669799804688, "loss": 1.0899, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5556970834732056, "rewards/margins": 0.7057680487632751, "rewards/rejected": -1.261465072631836, "step": 91 }, { "epoch": 0.13907785336356765, "epsilon_dpo/beta": 0.07136065512895584, "epsilon_dpo/beta_margin_grad_mean": -0.3597443103790283, "epsilon_dpo/beta_margin_grad_std": 0.1939815878868103, "epsilon_dpo/beta_margin_mean": 0.7340101003646851, "epsilon_dpo/beta_margin_std": 1.0939009189605713, "epsilon_dpo/loss_margin_mean": 10.364131927490234, "grad_norm": 66.7519302368164, "kl/avg_steps": 0.5625, "kl/beta": 0.07175715267658234, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.979887032076988e-07, "logits/chosen": -3.5171523094177246, "logits/rejected": -3.6251306533813477, "logps/chosen": -56.95709228515625, "logps/ref_chosen": -49.348655700683594, "logps/ref_rejected": -75.03353881835938, "logps/rejected": -93.006103515625, "loss": 1.0137, "rewards/accuracies": 0.75, "rewards/chosen": -0.5460585951805115, "rewards/margins": 0.7340099811553955, "rewards/rejected": -1.2800686359405518, "step": 92 }, { "epoch": 0.14058956916099774, "epsilon_dpo/beta": 0.07107299566268921, "epsilon_dpo/beta_margin_grad_mean": -0.38023558259010315, "epsilon_dpo/beta_margin_grad_std": 0.20673654973506927, "epsilon_dpo/beta_margin_mean": 0.6062856912612915, "epsilon_dpo/beta_margin_std": 1.0584166049957275, "epsilon_dpo/loss_margin_mean": 8.62741470336914, "grad_norm": 72.2237777709961, "kl/avg_steps": 0.40625, "kl/beta": 0.07135577499866486, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.978178526356172e-07, "logits/chosen": -3.527400493621826, "logits/rejected": -3.52805495262146, "logps/chosen": -53.27133560180664, "logps/ref_chosen": -46.696109771728516, "logps/ref_rejected": -52.51020050048828, "logps/rejected": -67.71283721923828, "loss": 1.1057, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4708820581436157, "rewards/margins": 0.6062856912612915, "rewards/rejected": -1.0771677494049072, "step": 93 }, { "epoch": 0.1421012849584278, "epsilon_dpo/beta": 0.07067438215017319, "epsilon_dpo/beta_margin_grad_mean": -0.3539963662624359, "epsilon_dpo/beta_margin_grad_std": 0.18799494206905365, "epsilon_dpo/beta_margin_mean": 0.7336546182632446, "epsilon_dpo/beta_margin_std": 1.0302444696426392, "epsilon_dpo/loss_margin_mean": 10.455020904541016, "grad_norm": 54.94295883178711, "kl/avg_steps": 0.5625, "kl/beta": 0.07106706500053406, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.976400700654751e-07, "logits/chosen": -3.5829176902770996, "logits/rejected": -3.5992307662963867, "logps/chosen": -56.55706787109375, "logps/ref_chosen": -51.918800354003906, "logps/ref_rejected": -67.240234375, "logps/rejected": -82.3335189819336, "loss": 0.9948, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3284054696559906, "rewards/margins": 0.7336546182632446, "rewards/rejected": -1.0620601177215576, "step": 94 }, { "epoch": 0.1436130007558579, "epsilon_dpo/beta": 0.07025697082281113, "epsilon_dpo/beta_margin_grad_mean": -0.34081026911735535, "epsilon_dpo/beta_margin_grad_std": 0.21506638824939728, "epsilon_dpo/beta_margin_mean": 0.8314006924629211, "epsilon_dpo/beta_margin_std": 1.1409651041030884, "epsilon_dpo/loss_margin_mean": 11.924653053283691, "grad_norm": 73.7522964477539, "kl/avg_steps": 0.59375, "kl/beta": 0.07066954672336578, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.974553604702332e-07, "logits/chosen": -3.567167282104492, "logits/rejected": -3.5548033714294434, "logps/chosen": -60.07548904418945, "logps/ref_chosen": -52.87811279296875, "logps/ref_rejected": -81.57817077636719, "logps/rejected": -100.70020294189453, "loss": 0.9823, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5080251097679138, "rewards/margins": 0.8314006924629211, "rewards/rejected": -1.339425802230835, "step": 95 }, { "epoch": 0.14512471655328799, "epsilon_dpo/beta": 0.06988619267940521, "epsilon_dpo/beta_margin_grad_mean": -0.3414158225059509, "epsilon_dpo/beta_margin_grad_std": 0.2209470272064209, "epsilon_dpo/beta_margin_mean": 0.8541780114173889, "epsilon_dpo/beta_margin_std": 1.2788479328155518, "epsilon_dpo/loss_margin_mean": 12.328063011169434, "grad_norm": 68.02438354492188, "kl/avg_steps": 0.53125, "kl/beta": 0.070252425968647, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.972637290166157e-07, "logits/chosen": -3.553460121154785, "logits/rejected": -3.6067492961883545, "logps/chosen": -55.40770721435547, "logps/ref_chosen": -49.08948516845703, "logps/ref_rejected": -84.07553100585938, "logps/rejected": -102.72181701660156, "loss": 1.014, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4448799788951874, "rewards/margins": 0.8541780114173889, "rewards/rejected": -1.299057960510254, "step": 96 }, { "epoch": 0.14663643235071808, "epsilon_dpo/beta": 0.0696260929107666, "epsilon_dpo/beta_margin_grad_mean": -0.41157081723213196, "epsilon_dpo/beta_margin_grad_std": 0.20113174617290497, "epsilon_dpo/beta_margin_mean": 0.4696374833583832, "epsilon_dpo/beta_margin_std": 1.0460851192474365, "epsilon_dpo/loss_margin_mean": 6.841196537017822, "grad_norm": 79.79370880126953, "kl/avg_steps": 0.375, "kl/beta": 0.06988118588924408, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.970651810649666e-07, "logits/chosen": -3.546657085418701, "logits/rejected": -3.5788378715515137, "logps/chosen": -65.90841674804688, "logps/ref_chosen": -57.28911590576172, "logps/ref_rejected": -72.4705581665039, "logps/rejected": -87.9310531616211, "loss": 1.1987, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6042090654373169, "rewards/margins": 0.4696374833583832, "rewards/rejected": -1.0738465785980225, "step": 97 }, { "epoch": 0.14814814814814814, "epsilon_dpo/beta": 0.06917014718055725, "epsilon_dpo/beta_margin_grad_mean": -0.3598596751689911, "epsilon_dpo/beta_margin_grad_std": 0.17661502957344055, "epsilon_dpo/beta_margin_mean": 0.6891156435012817, "epsilon_dpo/beta_margin_std": 0.9075184464454651, "epsilon_dpo/loss_margin_mean": 10.019757270812988, "grad_norm": 54.35417556762695, "kl/avg_steps": 0.65625, "kl/beta": 0.06962011009454727, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.968597221690985e-07, "logits/chosen": -3.534125328063965, "logits/rejected": -3.5029351711273193, "logps/chosen": -58.02435302734375, "logps/ref_chosen": -53.89137268066406, "logps/ref_rejected": -66.7619857788086, "logps/rejected": -80.91471862792969, "loss": 0.9833, "rewards/accuracies": 0.796875, "rewards/chosen": -0.28773120045661926, "rewards/margins": 0.6891156435012817, "rewards/rejected": -0.9768468141555786, "step": 98 }, { "epoch": 0.14965986394557823, "epsilon_dpo/beta": 0.06876240670681, "epsilon_dpo/beta_margin_grad_mean": -0.3565883934497833, "epsilon_dpo/beta_margin_grad_std": 0.18600483238697052, "epsilon_dpo/beta_margin_mean": 0.7350455522537231, "epsilon_dpo/beta_margin_std": 1.0124030113220215, "epsilon_dpo/loss_margin_mean": 10.75841236114502, "grad_norm": 71.46509552001953, "kl/avg_steps": 0.59375, "kl/beta": 0.06916620582342148, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.966473580761389e-07, "logits/chosen": -3.5785701274871826, "logits/rejected": -3.576293468475342, "logps/chosen": -53.51729965209961, "logps/ref_chosen": -48.90321350097656, "logps/ref_rejected": -71.833740234375, "logps/rejected": -87.20623779296875, "loss": 0.9858, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32003170251846313, "rewards/margins": 0.7350455522537231, "rewards/rejected": -1.055077314376831, "step": 99 }, { "epoch": 0.15117157974300832, "epsilon_dpo/beta": 0.06846398115158081, "epsilon_dpo/beta_margin_grad_mean": -0.35547131299972534, "epsilon_dpo/beta_margin_grad_std": 0.19236336648464203, "epsilon_dpo/beta_margin_mean": 0.7511593699455261, "epsilon_dpo/beta_margin_std": 1.0296648740768433, "epsilon_dpo/loss_margin_mean": 11.06265640258789, "grad_norm": 67.99085235595703, "kl/avg_steps": 0.4375, "kl/beta": 0.06875795125961304, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.964280947263676e-07, "logits/chosen": -3.6528518199920654, "logits/rejected": -3.5980725288391113, "logps/chosen": -75.37802124023438, "logps/ref_chosen": -69.31944274902344, "logps/ref_rejected": -84.00056457519531, "logps/rejected": -101.12179565429688, "loss": 0.9822, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4164029359817505, "rewards/margins": 0.7511593699455261, "rewards/rejected": -1.1675622463226318, "step": 100 }, { "epoch": 0.15117157974300832, "eval_epsilon_dpo/beta": 0.06822934001684189, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4125640094280243, "eval_epsilon_dpo/beta_margin_grad_std": 0.19712793827056885, "eval_epsilon_dpo/beta_margin_mean": 0.4365081191062927, "eval_epsilon_dpo/beta_margin_std": 1.0171581506729126, "eval_epsilon_dpo/loss_margin_mean": 6.496590614318848, "eval_kl/n_epsilon_steps": 0.3274647891521454, "eval_kl/p_epsilon_steps": 0.6720950603485107, "eval_logits/chosen": -3.559885263442993, "eval_logits/rejected": -3.5776286125183105, "eval_logps/chosen": -85.75508880615234, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -88.37115478515625, "eval_loss": 0.6126788258552551, "eval_rewards/accuracies": 0.6646126508712769, "eval_rewards/chosen": -0.5723935961723328, "eval_rewards/margins": 0.4365081191062927, "eval_rewards/rejected": -1.008901834487915, "eval_runtime": 37.3773, "eval_samples_per_second": 61.615, "eval_steps_per_second": 1.926, "step": 100 }, { "epoch": 0.15268329554043839, "epsilon_dpo/beta": 0.06799458712339401, "epsilon_dpo/beta_margin_grad_mean": -0.3459049165248871, "epsilon_dpo/beta_margin_grad_std": 0.17626599967479706, "epsilon_dpo/beta_margin_mean": 0.7596405148506165, "epsilon_dpo/beta_margin_std": 0.9050381183624268, "epsilon_dpo/loss_margin_mean": 11.231376647949219, "grad_norm": 56.005489349365234, "kl/avg_steps": 0.6875, "kl/beta": 0.0684584453701973, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.96201938253052e-07, "logits/chosen": -3.5751891136169434, "logits/rejected": -3.5783748626708984, "logps/chosen": -53.315773010253906, "logps/ref_chosen": -48.05763626098633, "logps/ref_rejected": -61.476898193359375, "logps/rejected": -77.96641540527344, "loss": 0.9338, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3587988018989563, "rewards/margins": 0.7596405148506165, "rewards/rejected": -1.1184393167495728, "step": 101 }, { "epoch": 0.15419501133786848, "epsilon_dpo/beta": 0.06767906993627548, "epsilon_dpo/beta_margin_grad_mean": -0.35825103521347046, "epsilon_dpo/beta_margin_grad_std": 0.20622548460960388, "epsilon_dpo/beta_margin_mean": 0.7302479147911072, "epsilon_dpo/beta_margin_std": 1.0540887117385864, "epsilon_dpo/loss_margin_mean": 10.890307426452637, "grad_norm": 67.409423828125, "kl/avg_steps": 0.46875, "kl/beta": 0.06799101084470749, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.959688949822748e-07, "logits/chosen": -3.6385841369628906, "logits/rejected": -3.5608317852020264, "logps/chosen": -60.683475494384766, "logps/ref_chosen": -55.533119201660156, "logps/ref_rejected": -61.735572814941406, "logps/rejected": -77.77622985839844, "loss": 1.0149, "rewards/accuracies": 0.734375, "rewards/chosen": -0.35122111439704895, "rewards/margins": 0.7302478551864624, "rewards/rejected": -1.081468939781189, "step": 102 }, { "epoch": 0.15570672713529857, "epsilon_dpo/beta": 0.06742674857378006, "epsilon_dpo/beta_margin_grad_mean": -0.36357906460762024, "epsilon_dpo/beta_margin_grad_std": 0.1782323569059372, "epsilon_dpo/beta_margin_mean": 0.6898066401481628, "epsilon_dpo/beta_margin_std": 0.9445146322250366, "epsilon_dpo/loss_margin_mean": 10.319247245788574, "grad_norm": 72.8338851928711, "kl/avg_steps": 0.375, "kl/beta": 0.06767378747463226, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.957289714327572e-07, "logits/chosen": -3.5234222412109375, "logits/rejected": -3.513826608657837, "logps/chosen": -64.4974365234375, "logps/ref_chosen": -57.1231689453125, "logps/ref_rejected": -63.90118408203125, "logps/rejected": -81.59469604492188, "loss": 0.9897, "rewards/accuracies": 0.75, "rewards/chosen": -0.4984205961227417, "rewards/margins": 0.6898066997528076, "rewards/rejected": -1.1882272958755493, "step": 103 }, { "epoch": 0.15721844293272866, "epsilon_dpo/beta": 0.06711163371801376, "epsilon_dpo/beta_margin_grad_mean": -0.36066433787345886, "epsilon_dpo/beta_margin_grad_std": 0.2093551903963089, "epsilon_dpo/beta_margin_mean": 0.7659928798675537, "epsilon_dpo/beta_margin_std": 1.1850172281265259, "epsilon_dpo/loss_margin_mean": 11.51352310180664, "grad_norm": 69.51878356933594, "kl/avg_steps": 0.46875, "kl/beta": 0.06742095947265625, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.954821743156767e-07, "logits/chosen": -3.42474102973938, "logits/rejected": -3.528376579284668, "logps/chosen": -57.66958236694336, "logps/ref_chosen": -50.00084686279297, "logps/ref_rejected": -83.87802124023438, "logps/rejected": -103.06028747558594, "loss": 1.0289, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5184776186943054, "rewards/margins": 0.7659928798675537, "rewards/rejected": -1.284470558166504, "step": 104 }, { "epoch": 0.15873015873015872, "epsilon_dpo/beta": 0.06679851561784744, "epsilon_dpo/beta_margin_grad_mean": -0.3829156458377838, "epsilon_dpo/beta_margin_grad_std": 0.19852976500988007, "epsilon_dpo/beta_margin_mean": 0.5970746278762817, "epsilon_dpo/beta_margin_std": 1.0629304647445679, "epsilon_dpo/loss_margin_mean": 9.033417701721191, "grad_norm": 76.06385803222656, "kl/avg_steps": 0.46875, "kl/beta": 0.06710639595985413, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.952285105344791e-07, "logits/chosen": -3.498959541320801, "logits/rejected": -3.626457452774048, "logps/chosen": -65.34221649169922, "logps/ref_chosen": -56.54688262939453, "logps/ref_rejected": -85.40049743652344, "logps/rejected": -103.229248046875, "loss": 1.1039, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5909122228622437, "rewards/margins": 0.597074568271637, "rewards/rejected": -1.1879868507385254, "step": 105 }, { "epoch": 0.1602418745275888, "epsilon_dpo/beta": 0.06642423570156097, "epsilon_dpo/beta_margin_grad_mean": -0.35905200242996216, "epsilon_dpo/beta_margin_grad_std": 0.1958753764629364, "epsilon_dpo/beta_margin_mean": 0.7247107625007629, "epsilon_dpo/beta_margin_std": 1.0886868238449097, "epsilon_dpo/loss_margin_mean": 10.989923477172852, "grad_norm": 61.79425048828125, "kl/avg_steps": 0.5625, "kl/beta": 0.06679330766201019, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.949679871846857e-07, "logits/chosen": -3.5636894702911377, "logits/rejected": -3.5526764392852783, "logps/chosen": -59.688499450683594, "logps/ref_chosen": -52.96286392211914, "logps/ref_rejected": -59.626102447509766, "logps/rejected": -77.34165954589844, "loss": 1.0224, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4488182067871094, "rewards/margins": 0.7247107625007629, "rewards/rejected": -1.1735289096832275, "step": 106 }, { "epoch": 0.1617535903250189, "epsilon_dpo/beta": 0.06605269759893417, "epsilon_dpo/beta_margin_grad_mean": -0.3398171365261078, "epsilon_dpo/beta_margin_grad_std": 0.19395723938941956, "epsilon_dpo/beta_margin_mean": 0.8332833051681519, "epsilon_dpo/beta_margin_std": 1.0627169609069824, "epsilon_dpo/loss_margin_mean": 12.69791316986084, "grad_norm": 64.81695556640625, "kl/avg_steps": 0.5625, "kl/beta": 0.06641969829797745, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.947006115536947e-07, "logits/chosen": -3.647794723510742, "logits/rejected": -3.581852912902832, "logps/chosen": -72.7358627319336, "logps/ref_chosen": -65.2283706665039, "logps/ref_rejected": -80.5244140625, "logps/rejected": -100.72981262207031, "loss": 0.9397, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4979873299598694, "rewards/margins": 0.8332833051681519, "rewards/rejected": -1.331270694732666, "step": 107 }, { "epoch": 0.16326530612244897, "epsilon_dpo/beta": 0.06574514508247375, "epsilon_dpo/beta_margin_grad_mean": -0.3466867506504059, "epsilon_dpo/beta_margin_grad_std": 0.22621804475784302, "epsilon_dpo/beta_margin_mean": 0.8275544047355652, "epsilon_dpo/beta_margin_std": 1.2847651243209839, "epsilon_dpo/loss_margin_mean": 12.713004112243652, "grad_norm": 67.85714721679688, "kl/avg_steps": 0.46875, "kl/beta": 0.06604817509651184, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.944263911205772e-07, "logits/chosen": -3.5607361793518066, "logits/rejected": -3.582914113998413, "logps/chosen": -71.61833953857422, "logps/ref_chosen": -63.451942443847656, "logps/ref_rejected": -72.61829376220703, "logps/rejected": -93.49769592285156, "loss": 1.0388, "rewards/accuracies": 0.75, "rewards/chosen": -0.5401895046234131, "rewards/margins": 0.8275543451309204, "rewards/rejected": -1.367743968963623, "step": 108 }, { "epoch": 0.16477702191987906, "epsilon_dpo/beta": 0.06535622477531433, "epsilon_dpo/beta_margin_grad_mean": -0.31177273392677307, "epsilon_dpo/beta_margin_grad_std": 0.18321716785430908, "epsilon_dpo/beta_margin_mean": 0.9792252779006958, "epsilon_dpo/beta_margin_std": 1.019906997680664, "epsilon_dpo/loss_margin_mean": 15.062385559082031, "grad_norm": 58.25711441040039, "kl/avg_steps": 0.59375, "kl/beta": 0.06574001908302307, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.941453335558681e-07, "logits/chosen": -3.5738940238952637, "logits/rejected": -3.6228842735290527, "logps/chosen": -59.53307342529297, "logps/ref_chosen": -52.251869201660156, "logps/ref_rejected": -85.80061340332031, "logps/rejected": -108.14420318603516, "loss": 0.829, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4775705337524414, "rewards/margins": 0.9792253971099854, "rewards/rejected": -1.4567959308624268, "step": 109 }, { "epoch": 0.16628873771730915, "epsilon_dpo/beta": 0.06505216658115387, "epsilon_dpo/beta_margin_grad_mean": -0.3661089837551117, "epsilon_dpo/beta_margin_grad_std": 0.1744028925895691, "epsilon_dpo/beta_margin_mean": 0.6749017238616943, "epsilon_dpo/beta_margin_std": 0.9821470975875854, "epsilon_dpo/loss_margin_mean": 10.454292297363281, "grad_norm": 63.71784210205078, "kl/avg_steps": 0.46875, "kl/beta": 0.06535199284553528, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.938574467213517e-07, "logits/chosen": -3.641899824142456, "logits/rejected": -3.5652589797973633, "logps/chosen": -74.46409606933594, "logps/ref_chosen": -63.26176452636719, "logps/ref_rejected": -63.747779846191406, "logps/rejected": -85.40440368652344, "loss": 1.004, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7321139574050903, "rewards/margins": 0.6749017238616943, "rewards/rejected": -1.4070156812667847, "step": 110 }, { "epoch": 0.16780045351473924, "epsilon_dpo/beta": 0.06470799446105957, "epsilon_dpo/beta_margin_grad_mean": -0.32783564925193787, "epsilon_dpo/beta_margin_grad_std": 0.20264722406864166, "epsilon_dpo/beta_margin_mean": 0.9278824925422668, "epsilon_dpo/beta_margin_std": 1.1223479509353638, "epsilon_dpo/loss_margin_mean": 14.434383392333984, "grad_norm": 71.378173828125, "kl/avg_steps": 0.53125, "kl/beta": 0.06504708528518677, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.935627386698418e-07, "logits/chosen": -3.573324680328369, "logits/rejected": -3.649160861968994, "logps/chosen": -58.07274627685547, "logps/ref_chosen": -45.88665008544922, "logps/ref_rejected": -73.46748352050781, "logps/rejected": -100.08796691894531, "loss": 0.9029, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7916420698165894, "rewards/margins": 0.9278824329376221, "rewards/rejected": -1.7195245027542114, "step": 111 }, { "epoch": 0.1693121693121693, "epsilon_dpo/beta": 0.06436605006456375, "epsilon_dpo/beta_margin_grad_mean": -0.32370397448539734, "epsilon_dpo/beta_margin_grad_std": 0.22360259294509888, "epsilon_dpo/beta_margin_mean": 0.9911072850227356, "epsilon_dpo/beta_margin_std": 1.2853206396102905, "epsilon_dpo/loss_margin_mean": 15.509366989135742, "grad_norm": 67.39942932128906, "kl/avg_steps": 0.53125, "kl/beta": 0.06470334529876709, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.932612176449559e-07, "logits/chosen": -3.6107521057128906, "logits/rejected": -3.599374532699585, "logps/chosen": -65.18389892578125, "logps/ref_chosen": -56.05567169189453, "logps/ref_rejected": -103.29936218261719, "logps/rejected": -127.93695068359375, "loss": 0.9302, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5916974544525146, "rewards/margins": 0.9911072254180908, "rewards/rejected": -1.5828046798706055, "step": 112 }, { "epoch": 0.1708238851095994, "epsilon_dpo/beta": 0.06404602527618408, "epsilon_dpo/beta_margin_grad_mean": -0.33465975522994995, "epsilon_dpo/beta_margin_grad_std": 0.22496308386325836, "epsilon_dpo/beta_margin_mean": 0.8950889706611633, "epsilon_dpo/beta_margin_std": 1.2329754829406738, "epsilon_dpo/loss_margin_mean": 14.100404739379883, "grad_norm": 71.98365020751953, "kl/avg_steps": 0.5, "kl/beta": 0.06436142325401306, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.929528920808854e-07, "logits/chosen": -3.472930669784546, "logits/rejected": -3.5502593517303467, "logps/chosen": -68.09251403808594, "logps/ref_chosen": -54.36525344848633, "logps/ref_rejected": -65.74974822998047, "logps/rejected": -93.5774154663086, "loss": 0.975, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8820576071739197, "rewards/margins": 0.8950889110565186, "rewards/rejected": -1.777146577835083, "step": 113 }, { "epoch": 0.17233560090702948, "epsilon_dpo/beta": 0.06374739855527878, "epsilon_dpo/beta_margin_grad_mean": -0.34491175413131714, "epsilon_dpo/beta_margin_grad_std": 0.22966939210891724, "epsilon_dpo/beta_margin_mean": 0.8657147288322449, "epsilon_dpo/beta_margin_std": 1.3151226043701172, "epsilon_dpo/loss_margin_mean": 13.705957412719727, "grad_norm": 69.6376953125, "kl/avg_steps": 0.46875, "kl/beta": 0.06404121965169907, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.92637770602159e-07, "logits/chosen": -3.462810516357422, "logits/rejected": -3.498316526412964, "logps/chosen": -72.93467712402344, "logps/ref_chosen": -59.35382843017578, "logps/ref_rejected": -63.291629791259766, "logps/rejected": -90.57844543457031, "loss": 1.0256, "rewards/accuracies": 0.734375, "rewards/chosen": -0.868782639503479, "rewards/margins": 0.8657146692276001, "rewards/rejected": -1.734497308731079, "step": 114 }, { "epoch": 0.17384731670445955, "epsilon_dpo/beta": 0.0633503720164299, "epsilon_dpo/beta_margin_grad_mean": -0.3075587749481201, "epsilon_dpo/beta_margin_grad_std": 0.19822528958320618, "epsilon_dpo/beta_margin_mean": 1.0519087314605713, "epsilon_dpo/beta_margin_std": 1.1657099723815918, "epsilon_dpo/loss_margin_mean": 16.694820404052734, "grad_norm": 57.91938018798828, "kl/avg_steps": 0.625, "kl/beta": 0.06374242901802063, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.923158620234019e-07, "logits/chosen": -3.5653257369995117, "logits/rejected": -3.5820350646972656, "logps/chosen": -59.44527816772461, "logps/ref_chosen": -46.262672424316406, "logps/ref_rejected": -70.71098327636719, "logps/rejected": -100.58840942382812, "loss": 0.8385, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8374489545822144, "rewards/margins": 1.0519086122512817, "rewards/rejected": -1.889357566833496, "step": 115 }, { "epoch": 0.17535903250188964, "epsilon_dpo/beta": 0.06301628798246384, "epsilon_dpo/beta_margin_grad_mean": -0.2945246398448944, "epsilon_dpo/beta_margin_grad_std": 0.2129705250263214, "epsilon_dpo/beta_margin_mean": 1.12185800075531, "epsilon_dpo/beta_margin_std": 1.196973204612732, "epsilon_dpo/loss_margin_mean": 17.919870376586914, "grad_norm": 51.288002014160156, "kl/avg_steps": 0.53125, "kl/beta": 0.0633465126156807, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.91987175349089e-07, "logits/chosen": -3.525303840637207, "logits/rejected": -3.5536141395568848, "logps/chosen": -57.35511779785156, "logps/ref_chosen": -44.168487548828125, "logps/ref_rejected": -63.91865921020508, "logps/rejected": -95.02516174316406, "loss": 0.8255, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8340129852294922, "rewards/margins": 1.12185800075531, "rewards/rejected": -1.9558711051940918, "step": 116 }, { "epoch": 0.17687074829931973, "epsilon_dpo/beta": 0.06268327683210373, "epsilon_dpo/beta_margin_grad_mean": -0.3226037621498108, "epsilon_dpo/beta_margin_grad_std": 0.24396450817584991, "epsilon_dpo/beta_margin_mean": 0.9844721555709839, "epsilon_dpo/beta_margin_std": 1.430822730064392, "epsilon_dpo/loss_margin_mean": 15.845026016235352, "grad_norm": 69.0673828125, "kl/avg_steps": 0.53125, "kl/beta": 0.06301175802946091, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.916517197732933e-07, "logits/chosen": -3.5357208251953125, "logits/rejected": -3.5620808601379395, "logps/chosen": -65.10205841064453, "logps/ref_chosen": -51.74369812011719, "logps/ref_rejected": -66.07595825195312, "logps/rejected": -95.27933502197266, "loss": 1.0169, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8414120674133301, "rewards/margins": 0.9844720959663391, "rewards/rejected": -1.825884222984314, "step": 117 }, { "epoch": 0.17838246409674982, "epsilon_dpo/beta": 0.06221490725874901, "epsilon_dpo/beta_margin_grad_mean": -0.29418981075286865, "epsilon_dpo/beta_margin_grad_std": 0.173599973320961, "epsilon_dpo/beta_margin_mean": 1.0497143268585205, "epsilon_dpo/beta_margin_std": 0.9615666270256042, "epsilon_dpo/loss_margin_mean": 16.9322509765625, "grad_norm": 60.609718322753906, "kl/avg_steps": 0.75, "kl/beta": 0.06267877668142319, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.913095046794281e-07, "logits/chosen": -3.484391689300537, "logits/rejected": -3.5277113914489746, "logps/chosen": -61.91997528076172, "logps/ref_chosen": -49.77568817138672, "logps/ref_rejected": -74.73394775390625, "logps/rejected": -103.81048583984375, "loss": 0.7706, "rewards/accuracies": 0.875, "rewards/chosen": -0.7568327188491821, "rewards/margins": 1.0497143268585205, "rewards/rejected": -1.8065470457077026, "step": 118 }, { "epoch": 0.17989417989417988, "epsilon_dpo/beta": 0.06200453266501427, "epsilon_dpo/beta_margin_grad_mean": -0.3812069594860077, "epsilon_dpo/beta_margin_grad_std": 0.23962487280368805, "epsilon_dpo/beta_margin_mean": 0.6779479384422302, "epsilon_dpo/beta_margin_std": 1.2889457941055298, "epsilon_dpo/loss_margin_mean": 11.082527160644531, "grad_norm": 85.9545669555664, "kl/avg_steps": 0.34375, "kl/beta": 0.06221218779683113, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.909605396399855e-07, "logits/chosen": -3.624180316925049, "logits/rejected": -3.5756680965423584, "logps/chosen": -72.6949462890625, "logps/ref_chosen": -53.84432601928711, "logps/ref_rejected": -70.61807250976562, "logps/rejected": -100.55122375488281, "loss": 1.1531, "rewards/accuracies": 0.671875, "rewards/chosen": -1.1748597621917725, "rewards/margins": 0.6779479384422302, "rewards/rejected": -1.8528077602386475, "step": 119 }, { "epoch": 0.18140589569160998, "epsilon_dpo/beta": 0.06169524043798447, "epsilon_dpo/beta_margin_grad_mean": -0.315789133310318, "epsilon_dpo/beta_margin_grad_std": 0.21923761069774628, "epsilon_dpo/beta_margin_mean": 1.036021113395691, "epsilon_dpo/beta_margin_std": 1.2428100109100342, "epsilon_dpo/loss_margin_mean": 16.918027877807617, "grad_norm": 62.9605827331543, "kl/avg_steps": 0.5, "kl/beta": 0.06199906766414642, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.906048344162676e-07, "logits/chosen": -3.5478808879852295, "logits/rejected": -3.58474063873291, "logps/chosen": -67.20063781738281, "logps/ref_chosen": -52.07549285888672, "logps/ref_rejected": -69.77534484863281, "logps/rejected": -101.81851196289062, "loss": 0.8888, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9364580512046814, "rewards/margins": 1.0360209941864014, "rewards/rejected": -1.9724791049957275, "step": 120 }, { "epoch": 0.18291761148904007, "epsilon_dpo/beta": 0.06131117790937424, "epsilon_dpo/beta_margin_grad_mean": -0.3246532678604126, "epsilon_dpo/beta_margin_grad_std": 0.22677934169769287, "epsilon_dpo/beta_margin_mean": 0.9546488523483276, "epsilon_dpo/beta_margin_std": 1.3126157522201538, "epsilon_dpo/loss_margin_mean": 15.68853759765625, "grad_norm": 65.85437774658203, "kl/avg_steps": 0.625, "kl/beta": 0.06169061362743378, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.902423989581143e-07, "logits/chosen": -3.525444984436035, "logits/rejected": -3.6508102416992188, "logps/chosen": -68.1076431274414, "logps/ref_chosen": -50.04947280883789, "logps/ref_rejected": -98.42295837402344, "logps/rejected": -132.16966247558594, "loss": 0.969, "rewards/accuracies": 0.828125, "rewards/chosen": -1.110429048538208, "rewards/margins": 0.9546488523483276, "rewards/rejected": -2.065077781677246, "step": 121 }, { "epoch": 0.18442932728647016, "epsilon_dpo/beta": 0.06100700423121452, "epsilon_dpo/beta_margin_grad_mean": -0.3200055956840515, "epsilon_dpo/beta_margin_grad_std": 0.2138427495956421, "epsilon_dpo/beta_margin_mean": 0.9862568974494934, "epsilon_dpo/beta_margin_std": 1.1971243619918823, "epsilon_dpo/loss_margin_mean": 16.28790855407715, "grad_norm": 67.44239807128906, "kl/avg_steps": 0.5, "kl/beta": 0.06130744144320488, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.898732434036243e-07, "logits/chosen": -3.574190616607666, "logits/rejected": -3.5785958766937256, "logps/chosen": -74.17181396484375, "logps/ref_chosen": -56.680877685546875, "logps/ref_rejected": -69.11771392822266, "logps/rejected": -102.89656066894531, "loss": 0.8983, "rewards/accuracies": 0.734375, "rewards/chosen": -1.069732904434204, "rewards/margins": 0.9862568974494934, "rewards/rejected": -2.0559897422790527, "step": 122 }, { "epoch": 0.18594104308390022, "epsilon_dpo/beta": 0.060627225786447525, "epsilon_dpo/beta_margin_grad_mean": -0.3068823218345642, "epsilon_dpo/beta_margin_grad_std": 0.21226945519447327, "epsilon_dpo/beta_margin_mean": 1.046561360359192, "epsilon_dpo/beta_margin_std": 1.2128506898880005, "epsilon_dpo/loss_margin_mean": 17.368038177490234, "grad_norm": 65.77674865722656, "kl/avg_steps": 0.625, "kl/beta": 0.06100242957472801, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.894973780788722e-07, "logits/chosen": -3.550365924835205, "logits/rejected": -3.5962891578674316, "logps/chosen": -68.05206298828125, "logps/ref_chosen": -49.78697967529297, "logps/ref_rejected": -76.16083526611328, "logps/rejected": -111.7939453125, "loss": 0.8709, "rewards/accuracies": 0.84375, "rewards/chosen": -1.109205722808838, "rewards/margins": 1.046561360359192, "rewards/rejected": -2.1557669639587402, "step": 123 }, { "epoch": 0.1874527588813303, "epsilon_dpo/beta": 0.06019382178783417, "epsilon_dpo/beta_margin_grad_mean": -0.3152182102203369, "epsilon_dpo/beta_margin_grad_std": 0.2096729874610901, "epsilon_dpo/beta_margin_mean": 0.996179461479187, "epsilon_dpo/beta_margin_std": 1.2183893918991089, "epsilon_dpo/loss_margin_mean": 16.639822006225586, "grad_norm": 63.897647857666016, "kl/avg_steps": 0.71875, "kl/beta": 0.06062353402376175, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.89114813497619e-07, "logits/chosen": -3.5235955715179443, "logits/rejected": -3.5984060764312744, "logps/chosen": -64.92776489257812, "logps/ref_chosen": -44.6346549987793, "logps/ref_rejected": -79.34061431884766, "logps/rejected": -116.27354431152344, "loss": 0.8993, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2249343395233154, "rewards/margins": 0.996179461479187, "rewards/rejected": -2.221113920211792, "step": 124 }, { "epoch": 0.1889644746787604, "epsilon_dpo/beta": 0.05985832214355469, "epsilon_dpo/beta_margin_grad_mean": -0.3303561508655548, "epsilon_dpo/beta_margin_grad_std": 0.22177883982658386, "epsilon_dpo/beta_margin_mean": 0.9359113574028015, "epsilon_dpo/beta_margin_std": 1.3095543384552002, "epsilon_dpo/loss_margin_mean": 15.757261276245117, "grad_norm": 71.04061889648438, "kl/avg_steps": 0.5625, "kl/beta": 0.06019091233611107, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.887255603610184e-07, "logits/chosen": -3.573054790496826, "logits/rejected": -3.6582694053649902, "logps/chosen": -79.61910247802734, "logps/ref_chosen": -59.55042266845703, "logps/ref_rejected": -82.81057739257812, "logps/rejected": -118.63652801513672, "loss": 0.9737, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2038168907165527, "rewards/margins": 0.9359113574028015, "rewards/rejected": -2.13972806930542, "step": 125 }, { "epoch": 0.19047619047619047, "epsilon_dpo/beta": 0.05961703509092331, "epsilon_dpo/beta_margin_grad_mean": -0.3442944884300232, "epsilon_dpo/beta_margin_grad_std": 0.2403549998998642, "epsilon_dpo/beta_margin_mean": 0.8773694038391113, "epsilon_dpo/beta_margin_std": 1.3643807172775269, "epsilon_dpo/loss_margin_mean": 14.874757766723633, "grad_norm": 76.93744659423828, "kl/avg_steps": 0.40625, "kl/beta": 0.059854231774806976, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.883296295573176e-07, "logits/chosen": -3.6127355098724365, "logits/rejected": -3.4942989349365234, "logps/chosen": -80.25390625, "logps/ref_chosen": -63.263946533203125, "logps/ref_rejected": -57.333274841308594, "logps/rejected": -89.19798278808594, "loss": 1.0496, "rewards/accuracies": 0.703125, "rewards/chosen": -1.017090082168579, "rewards/margins": 0.8773694038391113, "rewards/rejected": -1.8944594860076904, "step": 126 }, { "epoch": 0.19198790627362056, "epsilon_dpo/beta": 0.059245407581329346, "epsilon_dpo/beta_margin_grad_mean": -0.3256434202194214, "epsilon_dpo/beta_margin_grad_std": 0.19091641902923584, "epsilon_dpo/beta_margin_mean": 0.8926177024841309, "epsilon_dpo/beta_margin_std": 1.0224562883377075, "epsilon_dpo/loss_margin_mean": 15.152204513549805, "grad_norm": 63.04719924926758, "kl/avg_steps": 0.625, "kl/beta": 0.059612058103084564, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.87927032161552e-07, "logits/chosen": -3.534280776977539, "logits/rejected": -3.521151542663574, "logps/chosen": -70.83218383789062, "logps/ref_chosen": -54.136375427246094, "logps/ref_rejected": -59.19955062866211, "logps/rejected": -91.04756164550781, "loss": 0.8898, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9907190799713135, "rewards/margins": 0.8926177024841309, "rewards/rejected": -1.8833367824554443, "step": 127 }, { "epoch": 0.19349962207105065, "epsilon_dpo/beta": 0.0589514821767807, "epsilon_dpo/beta_margin_grad_mean": -0.3429934084415436, "epsilon_dpo/beta_margin_grad_std": 0.23296302556991577, "epsilon_dpo/beta_margin_mean": 0.8633172512054443, "epsilon_dpo/beta_margin_std": 1.3524885177612305, "epsilon_dpo/loss_margin_mean": 14.78608226776123, "grad_norm": 67.40087890625, "kl/avg_steps": 0.5, "kl/beta": 0.05924179404973984, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.875177794352363e-07, "logits/chosen": -3.5723466873168945, "logits/rejected": -3.675063133239746, "logps/chosen": -78.807861328125, "logps/ref_chosen": -60.14347839355469, "logps/ref_rejected": -89.54539489746094, "logps/rejected": -122.99586486816406, "loss": 1.0446, "rewards/accuracies": 0.75, "rewards/chosen": -1.1027114391326904, "rewards/margins": 0.8633172512054443, "rewards/rejected": -1.9660285711288452, "step": 128 }, { "epoch": 0.19501133786848074, "epsilon_dpo/beta": 0.0586397685110569, "epsilon_dpo/beta_margin_grad_mean": -0.33842793107032776, "epsilon_dpo/beta_margin_grad_std": 0.2152843177318573, "epsilon_dpo/beta_margin_mean": 0.8908988833427429, "epsilon_dpo/beta_margin_std": 1.2387031316757202, "epsilon_dpo/loss_margin_mean": 15.305898666381836, "grad_norm": 70.80146026611328, "kl/avg_steps": 0.53125, "kl/beta": 0.058947060257196426, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.871018828260491e-07, "logits/chosen": -3.590847969055176, "logits/rejected": -3.5320844650268555, "logps/chosen": -75.22663116455078, "logps/ref_chosen": -58.69816589355469, "logps/ref_rejected": -61.040306091308594, "logps/rejected": -92.87466430664062, "loss": 0.9729, "rewards/accuracies": 0.75, "rewards/chosen": -0.9724023342132568, "rewards/margins": 0.8908988237380981, "rewards/rejected": -1.8633012771606445, "step": 129 }, { "epoch": 0.1965230536659108, "epsilon_dpo/beta": 0.058256588876247406, "epsilon_dpo/beta_margin_grad_mean": -0.30823227763175964, "epsilon_dpo/beta_margin_grad_std": 0.22129854559898376, "epsilon_dpo/beta_margin_mean": 1.0533243417739868, "epsilon_dpo/beta_margin_std": 1.2728021144866943, "epsilon_dpo/loss_margin_mean": 18.19447135925293, "grad_norm": 59.63694763183594, "kl/avg_steps": 0.65625, "kl/beta": 0.05863555893301964, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.866793539675126e-07, "logits/chosen": -3.5602312088012695, "logits/rejected": -3.599091053009033, "logps/chosen": -64.18798828125, "logps/ref_chosen": -51.89386749267578, "logps/ref_rejected": -73.57125091552734, "logps/rejected": -104.05984497070312, "loss": 0.893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7195111513137817, "rewards/margins": 1.0533244609832764, "rewards/rejected": -1.772835612297058, "step": 130 }, { "epoch": 0.1980347694633409, "epsilon_dpo/beta": 0.058004215359687805, "epsilon_dpo/beta_margin_grad_mean": -0.32566961646080017, "epsilon_dpo/beta_margin_grad_std": 0.2217138558626175, "epsilon_dpo/beta_margin_mean": 0.9670554995536804, "epsilon_dpo/beta_margin_std": 1.2691491842269897, "epsilon_dpo/loss_margin_mean": 16.813655853271484, "grad_norm": 58.40495300292969, "kl/avg_steps": 0.4375, "kl/beta": 0.058253273367881775, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.86250204678667e-07, "logits/chosen": -3.4566946029663086, "logits/rejected": -3.594978094100952, "logps/chosen": -58.631591796875, "logps/ref_chosen": -43.99452209472656, "logps/ref_rejected": -74.55988311767578, "logps/rejected": -106.01060485839844, "loss": 0.9432, "rewards/accuracies": 0.75, "rewards/chosen": -0.8531282544136047, "rewards/margins": 0.9670554995536804, "rewards/rejected": -1.8201837539672852, "step": 131 }, { "epoch": 0.19954648526077098, "epsilon_dpo/beta": 0.057570286095142365, "epsilon_dpo/beta_margin_grad_mean": -0.3208879828453064, "epsilon_dpo/beta_margin_grad_std": 0.16454458236694336, "epsilon_dpo/beta_margin_mean": 0.8694888353347778, "epsilon_dpo/beta_margin_std": 0.8661810159683228, "epsilon_dpo/loss_margin_mean": 15.163482666015625, "grad_norm": 52.860023498535156, "kl/avg_steps": 0.75, "kl/beta": 0.05799952521920204, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.858144469637408e-07, "logits/chosen": -3.545797109603882, "logits/rejected": -3.576073169708252, "logps/chosen": -64.34201049804688, "logps/ref_chosen": -53.725013732910156, "logps/ref_rejected": -65.00053405761719, "logps/rejected": -90.78101348876953, "loss": 0.8476, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6127969026565552, "rewards/margins": 0.8694887757301331, "rewards/rejected": -1.482285737991333, "step": 132 }, { "epoch": 0.20105820105820105, "epsilon_dpo/beta": 0.0573396272957325, "epsilon_dpo/beta_margin_grad_mean": -0.3583378791809082, "epsilon_dpo/beta_margin_grad_std": 0.21854576468467712, "epsilon_dpo/beta_margin_mean": 0.745814323425293, "epsilon_dpo/beta_margin_std": 1.137496829032898, "epsilon_dpo/loss_margin_mean": 13.139826774597168, "grad_norm": 57.51015090942383, "kl/avg_steps": 0.40625, "kl/beta": 0.057567764073610306, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.853720930118138e-07, "logits/chosen": -3.6033637523651123, "logits/rejected": -3.5857484340667725, "logps/chosen": -68.73859405517578, "logps/ref_chosen": -58.187477111816406, "logps/ref_rejected": -65.84860229492188, "logps/rejected": -89.53955078125, "loss": 1.0395, "rewards/accuracies": 0.75, "rewards/chosen": -0.6090599298477173, "rewards/margins": 0.745814323425293, "rewards/rejected": -1.3548742532730103, "step": 133 }, { "epoch": 0.20256991685563114, "epsilon_dpo/beta": 0.05700011923909187, "epsilon_dpo/beta_margin_grad_mean": -0.31998392939567566, "epsilon_dpo/beta_margin_grad_std": 0.19890964031219482, "epsilon_dpo/beta_margin_mean": 0.9846644997596741, "epsilon_dpo/beta_margin_std": 1.1528493165969849, "epsilon_dpo/loss_margin_mean": 17.37661361694336, "grad_norm": 58.09659957885742, "kl/avg_steps": 0.59375, "kl/beta": 0.05733484402298927, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.849231551964771e-07, "logits/chosen": -3.5512003898620605, "logits/rejected": -3.593188762664795, "logps/chosen": -58.680641174316406, "logps/ref_chosen": -48.22349166870117, "logps/ref_rejected": -63.40602111816406, "logps/rejected": -91.23977661132812, "loss": 0.8727, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5977742671966553, "rewards/margins": 0.9846644997596741, "rewards/rejected": -1.5824388265609741, "step": 134 }, { "epoch": 0.20408163265306123, "epsilon_dpo/beta": 0.056699302047491074, "epsilon_dpo/beta_margin_grad_mean": -0.35570773482322693, "epsilon_dpo/beta_margin_grad_std": 0.18898187577724457, "epsilon_dpo/beta_margin_mean": 0.7198653221130371, "epsilon_dpo/beta_margin_std": 0.996110200881958, "epsilon_dpo/loss_margin_mean": 12.79709529876709, "grad_norm": 54.118099212646484, "kl/avg_steps": 0.53125, "kl/beta": 0.0569964274764061, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.844676460754862e-07, "logits/chosen": -3.526477813720703, "logits/rejected": -3.5801403522491455, "logps/chosen": -54.04167938232422, "logps/ref_chosen": -44.98304748535156, "logps/ref_rejected": -60.612892150878906, "logps/rejected": -82.46861267089844, "loss": 0.9933, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5154556035995483, "rewards/margins": 0.7198653221130371, "rewards/rejected": -1.235320806503296, "step": 135 }, { "epoch": 0.20559334845049132, "epsilon_dpo/beta": 0.05639968067407608, "epsilon_dpo/beta_margin_grad_mean": -0.33331847190856934, "epsilon_dpo/beta_margin_grad_std": 0.22204582393169403, "epsilon_dpo/beta_margin_mean": 0.9000535607337952, "epsilon_dpo/beta_margin_std": 1.2341701984405518, "epsilon_dpo/loss_margin_mean": 16.090578079223633, "grad_norm": 66.12653350830078, "kl/avg_steps": 0.53125, "kl/beta": 0.05669523403048515, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.840055783904106e-07, "logits/chosen": -3.5794856548309326, "logits/rejected": -3.6118791103363037, "logps/chosen": -75.9460678100586, "logps/ref_chosen": -63.462093353271484, "logps/ref_rejected": -82.55282592773438, "logps/rejected": -111.12737274169922, "loss": 0.9746, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7063400149345398, "rewards/margins": 0.9000535607337952, "rewards/rejected": -1.606393575668335, "step": 136 }, { "epoch": 0.20710506424792138, "epsilon_dpo/beta": 0.05611926317214966, "epsilon_dpo/beta_margin_grad_mean": -0.3649297058582306, "epsilon_dpo/beta_margin_grad_std": 0.2031417340040207, "epsilon_dpo/beta_margin_mean": 0.7089363932609558, "epsilon_dpo/beta_margin_std": 1.0984845161437988, "epsilon_dpo/loss_margin_mean": 12.74050235748291, "grad_norm": 54.84907913208008, "kl/avg_steps": 0.5, "kl/beta": 0.05639563128352165, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.835369650662767e-07, "logits/chosen": -3.517125129699707, "logits/rejected": -3.5808000564575195, "logps/chosen": -60.87702941894531, "logps/ref_chosen": -51.52919006347656, "logps/ref_rejected": -60.04787826538086, "logps/rejected": -82.13622283935547, "loss": 1.0408, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5274516344070435, "rewards/margins": 0.708936333656311, "rewards/rejected": -1.2363879680633545, "step": 137 }, { "epoch": 0.20861678004535147, "epsilon_dpo/beta": 0.05585760250687599, "epsilon_dpo/beta_margin_grad_mean": -0.370954304933548, "epsilon_dpo/beta_margin_grad_std": 0.19325093924999237, "epsilon_dpo/beta_margin_mean": 0.6258103847503662, "epsilon_dpo/beta_margin_std": 0.9835801124572754, "epsilon_dpo/loss_margin_mean": 11.314034461975098, "grad_norm": 56.331851959228516, "kl/avg_steps": 0.46875, "kl/beta": 0.056115057319402695, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.830618192112065e-07, "logits/chosen": -3.5940651893615723, "logits/rejected": -3.6036386489868164, "logps/chosen": -63.95580291748047, "logps/ref_chosen": -51.97818374633789, "logps/ref_rejected": -62.20045471191406, "logps/rejected": -85.49211120605469, "loss": 1.0603, "rewards/accuracies": 0.734375, "rewards/chosen": -0.671584963798523, "rewards/margins": 0.6258103847503662, "rewards/rejected": -1.2973952293395996, "step": 138 }, { "epoch": 0.21012849584278157, "epsilon_dpo/beta": 0.05547480285167694, "epsilon_dpo/beta_margin_grad_mean": -0.3201114237308502, "epsilon_dpo/beta_margin_grad_std": 0.18229545652866364, "epsilon_dpo/beta_margin_mean": 0.9476114511489868, "epsilon_dpo/beta_margin_std": 1.0544508695602417, "epsilon_dpo/loss_margin_mean": 17.160451889038086, "grad_norm": 53.29795455932617, "kl/avg_steps": 0.6875, "kl/beta": 0.05585324391722679, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.825801541160509e-07, "logits/chosen": -3.641890525817871, "logits/rejected": -3.5805788040161133, "logps/chosen": -73.00140380859375, "logps/ref_chosen": -61.71527099609375, "logps/ref_rejected": -74.5482177734375, "logps/rejected": -102.99479675292969, "loss": 0.855, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6271824836730957, "rewards/margins": 0.9476114511489868, "rewards/rejected": -1.5747939348220825, "step": 139 }, { "epoch": 0.21164021164021163, "epsilon_dpo/beta": 0.05513068661093712, "epsilon_dpo/beta_margin_grad_mean": -0.31432390213012695, "epsilon_dpo/beta_margin_grad_std": 0.18008609116077423, "epsilon_dpo/beta_margin_mean": 0.9658204913139343, "epsilon_dpo/beta_margin_std": 1.0079481601715088, "epsilon_dpo/loss_margin_mean": 17.605432510375977, "grad_norm": 52.28009796142578, "kl/avg_steps": 0.625, "kl/beta": 0.05547187477350235, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.820919832540181e-07, "logits/chosen": -3.612851619720459, "logits/rejected": -3.642123222351074, "logps/chosen": -59.99217224121094, "logps/ref_chosen": -52.2577018737793, "logps/ref_rejected": -77.62448120117188, "logps/rejected": -102.96438598632812, "loss": 0.8322, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4286110997200012, "rewards/margins": 0.9658205509185791, "rewards/rejected": -1.3944315910339355, "step": 140 }, { "epoch": 0.21315192743764172, "epsilon_dpo/beta": 0.05489163473248482, "epsilon_dpo/beta_margin_grad_mean": -0.3439890146255493, "epsilon_dpo/beta_margin_grad_std": 0.19013932347297668, "epsilon_dpo/beta_margin_mean": 0.8179143071174622, "epsilon_dpo/beta_margin_std": 1.0642812252044678, "epsilon_dpo/loss_margin_mean": 15.01675033569336, "grad_norm": 57.819007873535156, "kl/avg_steps": 0.4375, "kl/beta": 0.055127330124378204, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.815973202802966e-07, "logits/chosen": -3.590327024459839, "logits/rejected": -3.591952085494995, "logps/chosen": -65.06756591796875, "logps/ref_chosen": -55.019412994384766, "logps/ref_rejected": -75.90506744384766, "logps/rejected": -100.969970703125, "loss": 0.9428, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5527740120887756, "rewards/margins": 0.8179143071174622, "rewards/rejected": -1.3706883192062378, "step": 141 }, { "epoch": 0.2146636432350718, "epsilon_dpo/beta": 0.0545152984559536, "epsilon_dpo/beta_margin_grad_mean": -0.3276689946651459, "epsilon_dpo/beta_margin_grad_std": 0.17399372160434723, "epsilon_dpo/beta_margin_mean": 0.8715046048164368, "epsilon_dpo/beta_margin_std": 0.9612873196601868, "epsilon_dpo/loss_margin_mean": 16.058345794677734, "grad_norm": 51.09103775024414, "kl/avg_steps": 0.6875, "kl/beta": 0.05488719791173935, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.810961790316729e-07, "logits/chosen": -3.605912685394287, "logits/rejected": -3.574665069580078, "logps/chosen": -61.098785400390625, "logps/ref_chosen": -52.386016845703125, "logps/ref_rejected": -76.00948333740234, "logps/rejected": -100.78059387207031, "loss": 0.8728, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47557875514030457, "rewards/margins": 0.8715046644210815, "rewards/rejected": -1.347083330154419, "step": 142 }, { "epoch": 0.2161753590325019, "epsilon_dpo/beta": 0.054262325167655945, "epsilon_dpo/beta_margin_grad_mean": -0.38881781697273254, "epsilon_dpo/beta_margin_grad_std": 0.1886391043663025, "epsilon_dpo/beta_margin_mean": 0.5383601784706116, "epsilon_dpo/beta_margin_std": 0.9926903247833252, "epsilon_dpo/loss_margin_mean": 10.023398399353027, "grad_norm": 68.02210998535156, "kl/avg_steps": 0.46875, "kl/beta": 0.054512426257133484, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.805885735261454e-07, "logits/chosen": -3.536149501800537, "logits/rejected": -3.483452320098877, "logps/chosen": -67.81700134277344, "logps/ref_chosen": -56.04994201660156, "logps/ref_rejected": -62.13799285888672, "logps/rejected": -83.92845916748047, "loss": 1.1238, "rewards/accuracies": 0.75, "rewards/chosen": -0.6419233083724976, "rewards/margins": 0.5383601784706116, "rewards/rejected": -1.180283546447754, "step": 143 }, { "epoch": 0.21768707482993196, "epsilon_dpo/beta": 0.054110899567604065, "epsilon_dpo/beta_margin_grad_mean": -0.37841829657554626, "epsilon_dpo/beta_margin_grad_std": 0.22205866873264313, "epsilon_dpo/beta_margin_mean": 0.6549732685089111, "epsilon_dpo/beta_margin_std": 1.1850394010543823, "epsilon_dpo/loss_margin_mean": 12.263768196105957, "grad_norm": 68.52680206298828, "kl/avg_steps": 0.28125, "kl/beta": 0.054258089512586594, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.800745179625307e-07, "logits/chosen": -3.6007070541381836, "logits/rejected": -3.5770633220672607, "logps/chosen": -68.58539581298828, "logps/ref_chosen": -55.615821838378906, "logps/ref_rejected": -72.51919555664062, "logps/rejected": -97.75253295898438, "loss": 1.1194, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7052870392799377, "rewards/margins": 0.6549733281135559, "rewards/rejected": -1.3602603673934937, "step": 144 }, { "epoch": 0.21919879062736206, "epsilon_dpo/beta": 0.05380694940686226, "epsilon_dpo/beta_margin_grad_mean": -0.342498242855072, "epsilon_dpo/beta_margin_grad_std": 0.20996421575546265, "epsilon_dpo/beta_margin_mean": 0.8622766137123108, "epsilon_dpo/beta_margin_std": 1.16110098361969, "epsilon_dpo/loss_margin_mean": 16.14082145690918, "grad_norm": 63.95447540283203, "kl/avg_steps": 0.5625, "kl/beta": 0.05410591512918472, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.795540267200686e-07, "logits/chosen": -3.654618978500366, "logits/rejected": -3.648521900177002, "logps/chosen": -83.11436462402344, "logps/ref_chosen": -71.45491027832031, "logps/ref_rejected": -76.1609115600586, "logps/rejected": -103.96119689941406, "loss": 0.9602, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6299362182617188, "rewards/margins": 0.8622766137123108, "rewards/rejected": -1.4922128915786743, "step": 145 }, { "epoch": 0.22071050642479215, "epsilon_dpo/beta": 0.05353961139917374, "epsilon_dpo/beta_margin_grad_mean": -0.3552601635456085, "epsilon_dpo/beta_margin_grad_std": 0.1939755082130432, "epsilon_dpo/beta_margin_mean": 0.7515380382537842, "epsilon_dpo/beta_margin_std": 1.0374655723571777, "epsilon_dpo/loss_margin_mean": 14.148492813110352, "grad_norm": 60.54452896118164, "kl/avg_steps": 0.5, "kl/beta": 0.05380327254533768, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.790271143580173e-07, "logits/chosen": -3.5167057514190674, "logits/rejected": -3.4798407554626465, "logps/chosen": -62.25146484375, "logps/ref_chosen": -51.90599060058594, "logps/ref_rejected": -65.15228271484375, "logps/rejected": -89.64625549316406, "loss": 0.985, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5570020079612732, "rewards/margins": 0.7515380382537842, "rewards/rejected": -1.3085401058197021, "step": 146 }, { "epoch": 0.2222222222222222, "epsilon_dpo/beta": 0.053289975970983505, "epsilon_dpo/beta_margin_grad_mean": -0.3546842634677887, "epsilon_dpo/beta_margin_grad_std": 0.19978955388069153, "epsilon_dpo/beta_margin_mean": 0.7370317578315735, "epsilon_dpo/beta_margin_std": 1.0573750734329224, "epsilon_dpo/loss_margin_mean": 13.94896125793457, "grad_norm": 64.5087661743164, "kl/avg_steps": 0.46875, "kl/beta": 0.053535595536231995, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.784937956152489e-07, "logits/chosen": -3.6374592781066895, "logits/rejected": -3.5946922302246094, "logps/chosen": -71.42426300048828, "logps/ref_chosen": -58.26661682128906, "logps/ref_rejected": -73.25558471679688, "logps/rejected": -100.36219787597656, "loss": 1.0079, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7039262056350708, "rewards/margins": 0.7370317578315735, "rewards/rejected": -1.440958023071289, "step": 147 }, { "epoch": 0.2237339380196523, "epsilon_dpo/beta": 0.052941422909498215, "epsilon_dpo/beta_margin_grad_mean": -0.32845786213874817, "epsilon_dpo/beta_margin_grad_std": 0.20752853155136108, "epsilon_dpo/beta_margin_mean": 0.9538125991821289, "epsilon_dpo/beta_margin_std": 1.220561146736145, "epsilon_dpo/loss_margin_mean": 18.120281219482422, "grad_norm": 50.06168746948242, "kl/avg_steps": 0.65625, "kl/beta": 0.05328581854701042, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.779540854098347e-07, "logits/chosen": -3.536505937576294, "logits/rejected": -3.6118602752685547, "logps/chosen": -53.0875244140625, "logps/ref_chosen": -43.646690368652344, "logps/ref_rejected": -62.745277404785156, "logps/rejected": -90.30638885498047, "loss": 0.9172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5025051832199097, "rewards/margins": 0.9538125991821289, "rewards/rejected": -1.4563177824020386, "step": 148 }, { "epoch": 0.2252456538170824, "epsilon_dpo/beta": 0.05269553139805794, "epsilon_dpo/beta_margin_grad_mean": -0.3515811562538147, "epsilon_dpo/beta_margin_grad_std": 0.21278268098831177, "epsilon_dpo/beta_margin_mean": 0.7391578555107117, "epsilon_dpo/beta_margin_std": 1.1325379610061646, "epsilon_dpo/loss_margin_mean": 14.162592887878418, "grad_norm": 64.3943099975586, "kl/avg_steps": 0.46875, "kl/beta": 0.05293840914964676, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.774079988386296e-07, "logits/chosen": -3.5180792808532715, "logits/rejected": -3.567558526992798, "logps/chosen": -63.62730407714844, "logps/ref_chosen": -46.01310729980469, "logps/ref_rejected": -61.530677795410156, "logps/rejected": -93.3074722290039, "loss": 1.042, "rewards/accuracies": 0.75, "rewards/chosen": -0.9329289197921753, "rewards/margins": 0.7391577959060669, "rewards/rejected": -1.6720867156982422, "step": 149 }, { "epoch": 0.22675736961451248, "epsilon_dpo/beta": 0.052416734397411346, "epsilon_dpo/beta_margin_grad_mean": -0.3062080144882202, "epsilon_dpo/beta_margin_grad_std": 0.2106676995754242, "epsilon_dpo/beta_margin_mean": 1.0532740354537964, "epsilon_dpo/beta_margin_std": 1.1993595361709595, "epsilon_dpo/loss_margin_mean": 20.23509407043457, "grad_norm": 56.90858459472656, "kl/avg_steps": 0.53125, "kl/beta": 0.052691418677568436, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.768555511768486e-07, "logits/chosen": -3.591209888458252, "logits/rejected": -3.6025662422180176, "logps/chosen": -72.72065734863281, "logps/ref_chosen": -56.64420700073242, "logps/ref_rejected": -82.00848388671875, "logps/rejected": -118.32003021240234, "loss": 0.859, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8470699787139893, "rewards/margins": 1.053274154663086, "rewards/rejected": -1.9003441333770752, "step": 150 }, { "epoch": 0.22826908541194255, "epsilon_dpo/beta": 0.05204145982861519, "epsilon_dpo/beta_margin_grad_mean": -0.3043166995048523, "epsilon_dpo/beta_margin_grad_std": 0.21629007160663605, "epsilon_dpo/beta_margin_mean": 1.0496770143508911, "epsilon_dpo/beta_margin_std": 1.200351357460022, "epsilon_dpo/loss_margin_mean": 20.278778076171875, "grad_norm": 58.78172302246094, "kl/avg_steps": 0.71875, "kl/beta": 0.0524129755795002, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.762967578776406e-07, "logits/chosen": -3.5576634407043457, "logits/rejected": -3.6119213104248047, "logps/chosen": -57.69886016845703, "logps/ref_chosen": -44.93402099609375, "logps/ref_rejected": -73.86955261230469, "logps/rejected": -106.91316986083984, "loss": 0.8703, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6674463748931885, "rewards/margins": 1.0496768951416016, "rewards/rejected": -1.7171233892440796, "step": 151 }, { "epoch": 0.22978080120937264, "epsilon_dpo/beta": 0.05173513665795326, "epsilon_dpo/beta_margin_grad_mean": -0.3350249230861664, "epsilon_dpo/beta_margin_grad_std": 0.23123003542423248, "epsilon_dpo/beta_margin_mean": 0.8763881325721741, "epsilon_dpo/beta_margin_std": 1.2190828323364258, "epsilon_dpo/loss_margin_mean": 17.084829330444336, "grad_norm": 70.6114730834961, "kl/avg_steps": 0.59375, "kl/beta": 0.05203894525766373, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.757316345716553e-07, "logits/chosen": -3.625560760498047, "logits/rejected": -3.608933210372925, "logps/chosen": -69.50746154785156, "logps/ref_chosen": -51.6949577331543, "logps/ref_rejected": -70.3248062133789, "logps/rejected": -105.22213745117188, "loss": 0.9932, "rewards/accuracies": 0.8125, "rewards/chosen": -0.925588071346283, "rewards/margins": 0.8763881325721741, "rewards/rejected": -1.801976203918457, "step": 152 }, { "epoch": 0.23129251700680273, "epsilon_dpo/beta": 0.051413603127002716, "epsilon_dpo/beta_margin_grad_mean": -0.30014315247535706, "epsilon_dpo/beta_margin_grad_std": 0.1804763227701187, "epsilon_dpo/beta_margin_mean": 1.0139654874801636, "epsilon_dpo/beta_margin_std": 1.002108097076416, "epsilon_dpo/loss_margin_mean": 19.822311401367188, "grad_norm": 53.15495300292969, "kl/avg_steps": 0.625, "kl/beta": 0.051731787621974945, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.751601970666064e-07, "logits/chosen": -3.635664939880371, "logits/rejected": -3.5478134155273438, "logps/chosen": -76.57771301269531, "logps/ref_chosen": -60.60557556152344, "logps/ref_rejected": -67.41244506835938, "logps/rejected": -103.20689392089844, "loss": 0.8067, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8240259885787964, "rewards/margins": 1.0139654874801636, "rewards/rejected": -1.83799147605896, "step": 153 }, { "epoch": 0.2328042328042328, "epsilon_dpo/beta": 0.051174599677324295, "epsilon_dpo/beta_margin_grad_mean": -0.35717567801475525, "epsilon_dpo/beta_margin_grad_std": 0.23020318150520325, "epsilon_dpo/beta_margin_mean": 0.7237842679023743, "epsilon_dpo/beta_margin_std": 1.1998422145843506, "epsilon_dpo/loss_margin_mean": 14.30461597442627, "grad_norm": 74.33574676513672, "kl/avg_steps": 0.46875, "kl/beta": 0.05141047015786171, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.745824613468292e-07, "logits/chosen": -3.624474048614502, "logits/rejected": -3.5255866050720215, "logps/chosen": -71.77596282958984, "logps/ref_chosen": -54.084388732910156, "logps/ref_rejected": -60.368202209472656, "logps/rejected": -92.36438751220703, "loss": 1.0875, "rewards/accuracies": 0.75, "rewards/chosen": -0.9091506004333496, "rewards/margins": 0.723784327507019, "rewards/rejected": -1.632934808731079, "step": 154 }, { "epoch": 0.23431594860166288, "epsilon_dpo/beta": 0.05093584209680557, "epsilon_dpo/beta_margin_grad_mean": -0.33003032207489014, "epsilon_dpo/beta_margin_grad_std": 0.23836220800876617, "epsilon_dpo/beta_margin_mean": 0.9810704588890076, "epsilon_dpo/beta_margin_std": 1.4113388061523438, "epsilon_dpo/loss_margin_mean": 19.438228607177734, "grad_norm": 64.75841522216797, "kl/avg_steps": 0.46875, "kl/beta": 0.05117060989141464, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.7399844357283393e-07, "logits/chosen": -3.6578752994537354, "logits/rejected": -3.639376163482666, "logps/chosen": -81.88642120361328, "logps/ref_chosen": -60.370052337646484, "logps/ref_rejected": -76.73888397216797, "logps/rejected": -117.69347381591797, "loss": 0.9937, "rewards/accuracies": 0.734375, "rewards/chosen": -1.0995151996612549, "rewards/margins": 0.9810703992843628, "rewards/rejected": -2.080585479736328, "step": 155 }, { "epoch": 0.23582766439909297, "epsilon_dpo/beta": 0.050666358321905136, "epsilon_dpo/beta_margin_grad_mean": -0.3388752043247223, "epsilon_dpo/beta_margin_grad_std": 0.2115233689546585, "epsilon_dpo/beta_margin_mean": 0.8669487833976746, "epsilon_dpo/beta_margin_std": 1.179270625114441, "epsilon_dpo/loss_margin_mean": 17.244836807250977, "grad_norm": 58.32075881958008, "kl/avg_steps": 0.53125, "kl/beta": 0.050931867212057114, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.7340816008085305e-07, "logits/chosen": -3.590078830718994, "logits/rejected": -3.582353353500366, "logps/chosen": -82.50497436523438, "logps/ref_chosen": -60.56532669067383, "logps/ref_rejected": -76.27922058105469, "logps/rejected": -115.46370697021484, "loss": 0.9638, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1133005619049072, "rewards/margins": 0.8669488430023193, "rewards/rejected": -1.9802494049072266, "step": 156 }, { "epoch": 0.23733938019652306, "epsilon_dpo/beta": 0.050366949290037155, "epsilon_dpo/beta_margin_grad_mean": -0.33017948269844055, "epsilon_dpo/beta_margin_grad_std": 0.19075718522071838, "epsilon_dpo/beta_margin_mean": 0.8848145008087158, "epsilon_dpo/beta_margin_std": 1.0379109382629395, "epsilon_dpo/loss_margin_mean": 17.674890518188477, "grad_norm": 66.0638427734375, "kl/avg_steps": 0.59375, "kl/beta": 0.05066272243857384, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.728116273823847e-07, "logits/chosen": -3.5643856525421143, "logits/rejected": -3.5565683841705322, "logps/chosen": -67.79183959960938, "logps/ref_chosen": -46.92461395263672, "logps/ref_rejected": -56.241336822509766, "logps/rejected": -94.78345489501953, "loss": 0.8965, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0532903671264648, "rewards/margins": 0.8848145008087158, "rewards/rejected": -1.9381047487258911, "step": 157 }, { "epoch": 0.23885109599395313, "epsilon_dpo/beta": 0.050132621079683304, "epsilon_dpo/beta_margin_grad_mean": -0.3434183895587921, "epsilon_dpo/beta_margin_grad_std": 0.20261330902576447, "epsilon_dpo/beta_margin_mean": 0.8267223238945007, "epsilon_dpo/beta_margin_std": 1.0934404134750366, "epsilon_dpo/loss_margin_mean": 16.629432678222656, "grad_norm": 63.82464599609375, "kl/avg_steps": 0.46875, "kl/beta": 0.0503636859357357, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -3.562502861022949, "logits/rejected": -3.5705795288085938, "logps/chosen": -77.56390380859375, "logps/ref_chosen": -55.65465545654297, "logps/ref_rejected": -74.56820678710938, "logps/rejected": -113.10688781738281, "loss": 0.9578, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1020269393920898, "rewards/margins": 0.8267223834991455, "rewards/rejected": -1.9287493228912354, "step": 158 }, { "epoch": 0.24036281179138322, "epsilon_dpo/beta": 0.049883052706718445, "epsilon_dpo/beta_margin_grad_mean": -0.3375348746776581, "epsilon_dpo/beta_margin_grad_std": 0.18772846460342407, "epsilon_dpo/beta_margin_mean": 0.8095844984054565, "epsilon_dpo/beta_margin_std": 1.085353970527649, "epsilon_dpo/loss_margin_mean": 16.356380462646484, "grad_norm": 58.861141204833984, "kl/avg_steps": 0.5, "kl/beta": 0.05012870952486992, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.715998812855304e-07, "logits/chosen": -3.5401625633239746, "logits/rejected": -3.6210856437683105, "logps/chosen": -71.43243408203125, "logps/ref_chosen": -49.36960220336914, "logps/ref_rejected": -70.33050537109375, "logps/rejected": -108.74971771240234, "loss": 0.9588, "rewards/accuracies": 0.78125, "rewards/chosen": -1.104860544204712, "rewards/margins": 0.8095844984054565, "rewards/rejected": -1.914444923400879, "step": 159 }, { "epoch": 0.2418745275888133, "epsilon_dpo/beta": 0.04960370063781738, "epsilon_dpo/beta_margin_grad_mean": -0.35852161049842834, "epsilon_dpo/beta_margin_grad_std": 0.21626383066177368, "epsilon_dpo/beta_margin_mean": 0.7398229837417603, "epsilon_dpo/beta_margin_std": 1.161458969116211, "epsilon_dpo/loss_margin_mean": 15.047536849975586, "grad_norm": 57.379150390625, "kl/avg_steps": 0.5625, "kl/beta": 0.04987931251525879, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7098470178228755e-07, "logits/chosen": -3.447559356689453, "logits/rejected": -3.515584945678711, "logps/chosen": -66.16007232666016, "logps/ref_chosen": -40.571006774902344, "logps/ref_rejected": -56.52306365966797, "logps/rejected": -97.15966796875, "loss": 1.048, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2719833850860596, "rewards/margins": 0.7398229837417603, "rewards/rejected": -2.0118064880371094, "step": 160 }, { "epoch": 0.24338624338624337, "epsilon_dpo/beta": 0.04932624101638794, "epsilon_dpo/beta_margin_grad_mean": -0.3515292704105377, "epsilon_dpo/beta_margin_grad_std": 0.22672498226165771, "epsilon_dpo/beta_margin_mean": 0.798446774482727, "epsilon_dpo/beta_margin_std": 1.2363989353179932, "epsilon_dpo/loss_margin_mean": 16.3371524810791, "grad_norm": 65.39254760742188, "kl/avg_steps": 0.5625, "kl/beta": 0.04960031062364578, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.703633408618955e-07, "logits/chosen": -3.500631093978882, "logits/rejected": -3.5386672019958496, "logps/chosen": -72.21726989746094, "logps/ref_chosen": -48.20892333984375, "logps/ref_rejected": -60.323936462402344, "logps/rejected": -100.66943359375, "loss": 1.0437, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1880271434783936, "rewards/margins": 0.798446774482727, "rewards/rejected": -1.986473798751831, "step": 161 }, { "epoch": 0.24489795918367346, "epsilon_dpo/beta": 0.04903491958975792, "epsilon_dpo/beta_margin_grad_mean": -0.3111317753791809, "epsilon_dpo/beta_margin_grad_std": 0.21344825625419617, "epsilon_dpo/beta_margin_mean": 0.994334876537323, "epsilon_dpo/beta_margin_std": 1.1776517629623413, "epsilon_dpo/loss_margin_mean": 20.402523040771484, "grad_norm": 64.57035827636719, "kl/avg_steps": 0.59375, "kl/beta": 0.04932286962866783, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.697358159051549e-07, "logits/chosen": -3.6434125900268555, "logits/rejected": -3.6016693115234375, "logps/chosen": -87.64012145996094, "logps/ref_chosen": -62.44020080566406, "logps/ref_rejected": -76.23294830322266, "logps/rejected": -121.83538818359375, "loss": 0.8938, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2394516468048096, "rewards/margins": 0.9943349361419678, "rewards/rejected": -2.2337865829467773, "step": 162 }, { "epoch": 0.24640967498110355, "epsilon_dpo/beta": 0.04876081645488739, "epsilon_dpo/beta_margin_grad_mean": -0.30116501450538635, "epsilon_dpo/beta_margin_grad_std": 0.1910446733236313, "epsilon_dpo/beta_margin_mean": 1.0497266054153442, "epsilon_dpo/beta_margin_std": 1.0632407665252686, "epsilon_dpo/loss_margin_mean": 21.652732849121094, "grad_norm": 50.93362045288086, "kl/avg_steps": 0.5625, "kl/beta": 0.04903174191713333, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.691021444652876e-07, "logits/chosen": -3.565215587615967, "logits/rejected": -3.535281181335449, "logps/chosen": -64.9280776977539, "logps/ref_chosen": -45.337562561035156, "logps/ref_rejected": -63.273406982421875, "logps/rejected": -104.51664733886719, "loss": 0.8074, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9571138024330139, "rewards/margins": 1.0497266054153442, "rewards/rejected": -2.006840467453003, "step": 163 }, { "epoch": 0.24792139077853365, "epsilon_dpo/beta": 0.04845759645104408, "epsilon_dpo/beta_margin_grad_mean": -0.3239656388759613, "epsilon_dpo/beta_margin_grad_std": 0.21160705387592316, "epsilon_dpo/beta_margin_mean": 0.9332375526428223, "epsilon_dpo/beta_margin_std": 1.1710841655731201, "epsilon_dpo/loss_margin_mean": 19.389503479003906, "grad_norm": 54.76251220703125, "kl/avg_steps": 0.625, "kl/beta": 0.04875748232007027, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.6846234426744624e-07, "logits/chosen": -3.647716522216797, "logits/rejected": -3.614161968231201, "logps/chosen": -77.27468872070312, "logps/ref_chosen": -54.445289611816406, "logps/ref_rejected": -70.19892883300781, "logps/rejected": -112.41783142089844, "loss": 0.921, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1089608669281006, "rewards/margins": 0.9332375526428223, "rewards/rejected": -2.042198419570923, "step": 164 }, { "epoch": 0.2494331065759637, "epsilon_dpo/beta": 0.048202045261859894, "epsilon_dpo/beta_margin_grad_mean": -0.32203999161720276, "epsilon_dpo/beta_margin_grad_std": 0.2006681263446808, "epsilon_dpo/beta_margin_mean": 0.8855904340744019, "epsilon_dpo/beta_margin_std": 1.0164930820465088, "epsilon_dpo/loss_margin_mean": 18.509578704833984, "grad_norm": 56.154781341552734, "kl/avg_steps": 0.53125, "kl/beta": 0.0484546422958374, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.678164332082175e-07, "logits/chosen": -3.5790343284606934, "logits/rejected": -3.5486669540405273, "logps/chosen": -71.81561279296875, "logps/ref_chosen": -47.389827728271484, "logps/ref_rejected": -63.35897445678711, "logps/rejected": -106.29434204101562, "loss": 0.9023, "rewards/accuracies": 0.75, "rewards/chosen": -1.1805477142333984, "rewards/margins": 0.8855903744697571, "rewards/rejected": -2.06613826751709, "step": 165 }, { "epoch": 0.2509448223733938, "epsilon_dpo/beta": 0.048007577657699585, "epsilon_dpo/beta_margin_grad_mean": -0.36170363426208496, "epsilon_dpo/beta_margin_grad_std": 0.1843031495809555, "epsilon_dpo/beta_margin_mean": 0.6799610257148743, "epsilon_dpo/beta_margin_std": 0.9218351244926453, "epsilon_dpo/loss_margin_mean": 14.293317794799805, "grad_norm": 63.26681137084961, "kl/avg_steps": 0.40625, "kl/beta": 0.04819858446717262, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6716442935512214e-07, "logits/chosen": -3.4971320629119873, "logits/rejected": -3.661100387573242, "logps/chosen": -77.93630981445312, "logps/ref_chosen": -57.656272888183594, "logps/ref_rejected": -79.67424774169922, "logps/rejected": -114.24760437011719, "loss": 0.9964, "rewards/accuracies": 0.703125, "rewards/chosen": -0.97629314661026, "rewards/margins": 0.6799610257148743, "rewards/rejected": -1.6562541723251343, "step": 166 }, { "epoch": 0.25245653817082386, "epsilon_dpo/beta": 0.047663308680057526, "epsilon_dpo/beta_margin_grad_mean": -0.3211416006088257, "epsilon_dpo/beta_margin_grad_std": 0.18262603878974915, "epsilon_dpo/beta_margin_mean": 0.9023188352584839, "epsilon_dpo/beta_margin_std": 1.0030823945999146, "epsilon_dpo/loss_margin_mean": 19.02277374267578, "grad_norm": 49.43205642700195, "kl/avg_steps": 0.71875, "kl/beta": 0.04800356924533844, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -3.5860724449157715, "logits/rejected": -3.5782463550567627, "logps/chosen": -71.65338134765625, "logps/ref_chosen": -51.68077850341797, "logps/ref_rejected": -67.53275299072266, "logps/rejected": -106.52812957763672, "loss": 0.8733, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9536038637161255, "rewards/margins": 0.9023188352584839, "rewards/rejected": -1.855922818183899, "step": 167 }, { "epoch": 0.25396825396825395, "epsilon_dpo/beta": 0.04745723679661751, "epsilon_dpo/beta_margin_grad_mean": -0.3584892749786377, "epsilon_dpo/beta_margin_grad_std": 0.20406082272529602, "epsilon_dpo/beta_margin_mean": 0.7274661660194397, "epsilon_dpo/beta_margin_std": 1.0659621953964233, "epsilon_dpo/loss_margin_mean": 15.473912239074707, "grad_norm": 59.509124755859375, "kl/avg_steps": 0.4375, "kl/beta": 0.047661006450653076, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.6584221638904767e-07, "logits/chosen": -3.56551194190979, "logits/rejected": -3.6357436180114746, "logps/chosen": -75.70201110839844, "logps/ref_chosen": -53.42637634277344, "logps/ref_rejected": -75.50228118896484, "logps/rejected": -113.2518310546875, "loss": 1.0193, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0603691339492798, "rewards/margins": 0.7274661064147949, "rewards/rejected": -1.7878352403640747, "step": 168 }, { "epoch": 0.25547996976568405, "epsilon_dpo/beta": 0.047176361083984375, "epsilon_dpo/beta_margin_grad_mean": -0.33562350273132324, "epsilon_dpo/beta_margin_grad_std": 0.20018763840198517, "epsilon_dpo/beta_margin_mean": 0.881025493144989, "epsilon_dpo/beta_margin_std": 1.192308783531189, "epsilon_dpo/loss_margin_mean": 18.79608917236328, "grad_norm": 54.50014877319336, "kl/avg_steps": 0.59375, "kl/beta": 0.04745339974761009, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.651720442612075e-07, "logits/chosen": -3.5871801376342773, "logits/rejected": -3.5970380306243896, "logps/chosen": -66.86529541015625, "logps/ref_chosen": -49.40599060058594, "logps/ref_rejected": -65.49603271484375, "logps/rejected": -101.75143432617188, "loss": 0.9447, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8265011310577393, "rewards/margins": 0.8810254335403442, "rewards/rejected": -1.7075265645980835, "step": 169 }, { "epoch": 0.25699168556311414, "epsilon_dpo/beta": 0.046853676438331604, "epsilon_dpo/beta_margin_grad_mean": -0.3128993511199951, "epsilon_dpo/beta_margin_grad_std": 0.18948619067668915, "epsilon_dpo/beta_margin_mean": 0.9593890309333801, "epsilon_dpo/beta_margin_std": 1.0811383724212646, "epsilon_dpo/loss_margin_mean": 20.58062744140625, "grad_norm": 46.207069396972656, "kl/avg_steps": 0.6875, "kl/beta": 0.047173306345939636, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.6449585330874425e-07, "logits/chosen": -3.5795295238494873, "logits/rejected": -3.5198726654052734, "logps/chosen": -61.862640380859375, "logps/ref_chosen": -47.971099853515625, "logps/ref_rejected": -61.10160446166992, "logps/rejected": -95.57377624511719, "loss": 0.8658, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6522313356399536, "rewards/margins": 0.9593890309333801, "rewards/rejected": -1.6116204261779785, "step": 170 }, { "epoch": 0.2585034013605442, "epsilon_dpo/beta": 0.046577684581279755, "epsilon_dpo/beta_margin_grad_mean": -0.31205323338508606, "epsilon_dpo/beta_margin_grad_std": 0.2107831835746765, "epsilon_dpo/beta_margin_mean": 0.9846940636634827, "epsilon_dpo/beta_margin_std": 1.1809011697769165, "epsilon_dpo/loss_margin_mean": 21.28053092956543, "grad_norm": 53.736328125, "kl/avg_steps": 0.59375, "kl/beta": 0.046851206570863724, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.6381366244617224e-07, "logits/chosen": -3.5749385356903076, "logits/rejected": -3.6224958896636963, "logps/chosen": -72.02838134765625, "logps/ref_chosen": -55.938690185546875, "logps/ref_rejected": -70.7587890625, "logps/rejected": -108.12901306152344, "loss": 0.8972, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7522577047348022, "rewards/margins": 0.9846941232681274, "rewards/rejected": -1.7369518280029297, "step": 171 }, { "epoch": 0.2600151171579743, "epsilon_dpo/beta": 0.04633907601237297, "epsilon_dpo/beta_margin_grad_mean": -0.33705294132232666, "epsilon_dpo/beta_margin_grad_std": 0.20389167964458466, "epsilon_dpo/beta_margin_mean": 0.874911904335022, "epsilon_dpo/beta_margin_std": 1.2169901132583618, "epsilon_dpo/loss_margin_mean": 19.021936416625977, "grad_norm": 50.66790771484375, "kl/avg_steps": 0.515625, "kl/beta": 0.046574667096138, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.631254907558365e-07, "logits/chosen": -3.5963845252990723, "logits/rejected": -3.6500320434570312, "logps/chosen": -63.69145584106445, "logps/ref_chosen": -45.553306579589844, "logps/ref_rejected": -77.72467041015625, "logps/rejected": -114.88475036621094, "loss": 0.9646, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8427436351776123, "rewards/margins": 0.874911904335022, "rewards/rejected": -1.7176555395126343, "step": 172 }, { "epoch": 0.2615268329554044, "epsilon_dpo/beta": 0.04613764211535454, "epsilon_dpo/beta_margin_grad_mean": -0.32221221923828125, "epsilon_dpo/beta_margin_grad_std": 0.23570886254310608, "epsilon_dpo/beta_margin_mean": 0.9976052045822144, "epsilon_dpo/beta_margin_std": 1.3106862306594849, "epsilon_dpo/loss_margin_mean": 21.82313346862793, "grad_norm": 53.169654846191406, "kl/avg_steps": 0.4375, "kl/beta": 0.046335749328136444, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.624313574873786e-07, "logits/chosen": -3.594334602355957, "logits/rejected": -3.637389659881592, "logps/chosen": -69.34404754638672, "logps/ref_chosen": -51.619972229003906, "logps/ref_rejected": -70.03333282470703, "logps/rejected": -109.58055114746094, "loss": 0.9509, "rewards/accuracies": 0.734375, "rewards/chosen": -0.820642352104187, "rewards/margins": 0.9976052045822144, "rewards/rejected": -1.8182475566864014, "step": 173 }, { "epoch": 0.26303854875283444, "epsilon_dpo/beta": 0.04589341580867767, "epsilon_dpo/beta_margin_grad_mean": -0.33343765139579773, "epsilon_dpo/beta_margin_grad_std": 0.21264557540416718, "epsilon_dpo/beta_margin_mean": 0.889533281326294, "epsilon_dpo/beta_margin_std": 1.1625179052352905, "epsilon_dpo/loss_margin_mean": 19.532835006713867, "grad_norm": 59.12575912475586, "kl/avg_steps": 0.53125, "kl/beta": 0.04613391309976578, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.61731282057198e-07, "logits/chosen": -3.5268261432647705, "logits/rejected": -3.5748157501220703, "logps/chosen": -71.56057739257812, "logps/ref_chosen": -51.86175537109375, "logps/ref_rejected": -81.18296813964844, "logps/rejected": -120.41461181640625, "loss": 0.9488, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9065666198730469, "rewards/margins": 0.889533281326294, "rewards/rejected": -1.7960999011993408, "step": 174 }, { "epoch": 0.26455026455026454, "epsilon_dpo/beta": 0.045650895684957504, "epsilon_dpo/beta_margin_grad_mean": -0.3223435580730438, "epsilon_dpo/beta_margin_grad_std": 0.22029392421245575, "epsilon_dpo/beta_margin_mean": 1.005196213722229, "epsilon_dpo/beta_margin_std": 1.2857012748718262, "epsilon_dpo/loss_margin_mean": 22.18084144592285, "grad_norm": 56.27669906616211, "kl/avg_steps": 0.53125, "kl/beta": 0.04589012265205383, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.6102528404790965e-07, "logits/chosen": -3.6441550254821777, "logits/rejected": -3.5665152072906494, "logps/chosen": -76.30023193359375, "logps/ref_chosen": -59.74464416503906, "logps/ref_rejected": -78.72744750976562, "logps/rejected": -117.4638671875, "loss": 0.919, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7581020593643188, "rewards/margins": 1.005196213722229, "rewards/rejected": -1.7632982730865479, "step": 175 }, { "epoch": 0.2660619803476946, "epsilon_dpo/beta": 0.045366860926151276, "epsilon_dpo/beta_margin_grad_mean": -0.33774280548095703, "epsilon_dpo/beta_margin_grad_std": 0.21605472266674042, "epsilon_dpo/beta_margin_mean": 0.8728004693984985, "epsilon_dpo/beta_margin_std": 1.3553060293197632, "epsilon_dpo/loss_margin_mean": 19.382476806640625, "grad_norm": 64.23115539550781, "kl/avg_steps": 0.625, "kl/beta": 0.045647621154785156, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.603133832077953e-07, "logits/chosen": -3.6345720291137695, "logits/rejected": -3.6211352348327637, "logps/chosen": -80.3158950805664, "logps/ref_chosen": -61.95441818237305, "logps/ref_rejected": -78.63496398925781, "logps/rejected": -116.37892150878906, "loss": 1.0251, "rewards/accuracies": 0.796875, "rewards/chosen": -0.836687445640564, "rewards/margins": 0.8728004693984985, "rewards/rejected": -1.7094879150390625, "step": 176 }, { "epoch": 0.2675736961451247, "epsilon_dpo/beta": 0.045014187693595886, "epsilon_dpo/beta_margin_grad_mean": -0.2686346173286438, "epsilon_dpo/beta_margin_grad_std": 0.1771967113018036, "epsilon_dpo/beta_margin_mean": 1.2202128171920776, "epsilon_dpo/beta_margin_std": 1.027571201324463, "epsilon_dpo/loss_margin_mean": 27.18921661376953, "grad_norm": 47.27944564819336, "kl/avg_steps": 0.78125, "kl/beta": 0.04536409303545952, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.5959559945025183e-07, "logits/chosen": -3.5726253986358643, "logits/rejected": -3.620364189147949, "logps/chosen": -67.03947448730469, "logps/ref_chosen": -52.42279815673828, "logps/ref_rejected": -85.30690002441406, "logps/rejected": -127.11279296875, "loss": 0.6989, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6589210033416748, "rewards/margins": 1.2202129364013672, "rewards/rejected": -1.879133939743042, "step": 177 }, { "epoch": 0.2690854119425548, "epsilon_dpo/beta": 0.044805917888879776, "epsilon_dpo/beta_margin_grad_mean": -0.360929012298584, "epsilon_dpo/beta_margin_grad_std": 0.1955314576625824, "epsilon_dpo/beta_margin_mean": 0.728017270565033, "epsilon_dpo/beta_margin_std": 1.049317717552185, "epsilon_dpo/loss_margin_mean": 16.383920669555664, "grad_norm": 57.02069091796875, "kl/avg_steps": 0.46875, "kl/beta": 0.04501243308186531, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.588719528532341e-07, "logits/chosen": -3.619588613510132, "logits/rejected": -3.6338610649108887, "logps/chosen": -77.85858917236328, "logps/ref_chosen": -59.63611602783203, "logps/ref_rejected": -71.87225341796875, "logps/rejected": -106.47864532470703, "loss": 1.0054, "rewards/accuracies": 0.75, "rewards/chosen": -0.8191609382629395, "rewards/margins": 0.728017270565033, "rewards/rejected": -1.5471782684326172, "step": 178 }, { "epoch": 0.2705971277399849, "epsilon_dpo/beta": 0.04455486312508583, "epsilon_dpo/beta_margin_grad_mean": -0.3204881548881531, "epsilon_dpo/beta_margin_grad_std": 0.1949918568134308, "epsilon_dpo/beta_margin_mean": 0.9805029630661011, "epsilon_dpo/beta_margin_std": 1.1742777824401855, "epsilon_dpo/loss_margin_mean": 22.144079208374023, "grad_norm": 52.48714828491211, "kl/avg_steps": 0.5625, "kl/beta": 0.04480242356657982, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.581424636586928e-07, "logits/chosen": -3.571816921234131, "logits/rejected": -3.5420970916748047, "logps/chosen": -75.46466064453125, "logps/ref_chosen": -57.10906219482422, "logps/ref_rejected": -67.3602523803711, "logps/rejected": -107.85993957519531, "loss": 0.8712, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8206260800361633, "rewards/margins": 0.9805029630661011, "rewards/rejected": -1.8011291027069092, "step": 179 }, { "epoch": 0.272108843537415, "epsilon_dpo/beta": 0.04436134174466133, "epsilon_dpo/beta_margin_grad_mean": -0.34988975524902344, "epsilon_dpo/beta_margin_grad_std": 0.21295034885406494, "epsilon_dpo/beta_margin_mean": 0.8287549614906311, "epsilon_dpo/beta_margin_std": 1.1953363418579102, "epsilon_dpo/loss_margin_mean": 18.84685516357422, "grad_norm": 58.387325286865234, "kl/avg_steps": 0.4375, "kl/beta": 0.04455181956291199, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.5740715227200897e-07, "logits/chosen": -3.5672121047973633, "logits/rejected": -3.5216469764709473, "logps/chosen": -67.44782257080078, "logps/ref_chosen": -51.37254333496094, "logps/ref_rejected": -58.99993896484375, "logps/rejected": -93.92207336425781, "loss": 0.9957, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7159271240234375, "rewards/margins": 0.8287550210952759, "rewards/rejected": -1.5446821451187134, "step": 180 }, { "epoch": 0.273620559334845, "epsilon_dpo/beta": 0.044015612453222275, "epsilon_dpo/beta_margin_grad_mean": -0.28116342425346375, "epsilon_dpo/beta_margin_grad_std": 0.16739556193351746, "epsilon_dpo/beta_margin_mean": 1.1541932821273804, "epsilon_dpo/beta_margin_std": 1.033773422241211, "epsilon_dpo/loss_margin_mean": 26.29059600830078, "grad_norm": 47.46848678588867, "kl/avg_steps": 0.78125, "kl/beta": 0.04435775429010391, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.566660392614228e-07, "logits/chosen": -3.6032066345214844, "logits/rejected": -3.663069248199463, "logps/chosen": -65.22819519042969, "logps/ref_chosen": -52.26130294799805, "logps/ref_rejected": -77.60012817382812, "logps/rejected": -116.85762023925781, "loss": 0.7235, "rewards/accuracies": 0.890625, "rewards/chosen": -0.571053683757782, "rewards/margins": 1.1541932821273804, "rewards/rejected": -1.7252469062805176, "step": 181 }, { "epoch": 0.2751322751322751, "epsilon_dpo/beta": 0.043715670704841614, "epsilon_dpo/beta_margin_grad_mean": -0.2936168611049652, "epsilon_dpo/beta_margin_grad_std": 0.194436177611351, "epsilon_dpo/beta_margin_mean": 1.1473525762557983, "epsilon_dpo/beta_margin_std": 1.194411277770996, "epsilon_dpo/loss_margin_mean": 26.357786178588867, "grad_norm": 52.253238677978516, "kl/avg_steps": 0.6875, "kl/beta": 0.044013895094394684, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.5591914535745817e-07, "logits/chosen": -3.5446319580078125, "logits/rejected": -3.580604076385498, "logps/chosen": -67.93936920166016, "logps/ref_chosen": -52.05140686035156, "logps/ref_rejected": -81.18222045898438, "logps/rejected": -123.427978515625, "loss": 0.7864, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6979972124099731, "rewards/margins": 1.1473525762557983, "rewards/rejected": -1.8453497886657715, "step": 182 }, { "epoch": 0.2766439909297052, "epsilon_dpo/beta": 0.04351281002163887, "epsilon_dpo/beta_margin_grad_mean": -0.3895297348499298, "epsilon_dpo/beta_margin_grad_std": 0.2144244760274887, "epsilon_dpo/beta_margin_mean": 0.5633929967880249, "epsilon_dpo/beta_margin_std": 1.1432398557662964, "epsilon_dpo/loss_margin_mean": 13.11093807220459, "grad_norm": 69.27127075195312, "kl/avg_steps": 0.46875, "kl/beta": 0.043713364750146866, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.551664914523433e-07, "logits/chosen": -3.534937858581543, "logits/rejected": -3.5456008911132812, "logps/chosen": -81.08416748046875, "logps/ref_chosen": -54.96217727661133, "logps/ref_rejected": -64.22110748291016, "logps/rejected": -103.45404052734375, "loss": 1.1692, "rewards/accuracies": 0.75, "rewards/chosen": -1.1410609483718872, "rewards/margins": 0.5633929967880249, "rewards/rejected": -1.704453945159912, "step": 183 }, { "epoch": 0.2781557067271353, "epsilon_dpo/beta": 0.04325540363788605, "epsilon_dpo/beta_margin_grad_mean": -0.30157220363616943, "epsilon_dpo/beta_margin_grad_std": 0.18168756365776062, "epsilon_dpo/beta_margin_mean": 1.048963189125061, "epsilon_dpo/beta_margin_std": 1.0502376556396484, "epsilon_dpo/loss_margin_mean": 24.37201499938965, "grad_norm": 48.86817169189453, "kl/avg_steps": 0.59375, "kl/beta": 0.04350941628217697, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.544080985994258e-07, "logits/chosen": -3.4998884201049805, "logits/rejected": -3.5392417907714844, "logps/chosen": -54.561336517333984, "logps/ref_chosen": -40.86670684814453, "logps/ref_rejected": -65.33458709716797, "logps/rejected": -103.40122985839844, "loss": 0.7961, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5926429033279419, "rewards/margins": 1.0489630699157715, "rewards/rejected": -1.641606092453003, "step": 184 }, { "epoch": 0.2796674225245654, "epsilon_dpo/beta": 0.043081194162368774, "epsilon_dpo/beta_margin_grad_mean": -0.32219430804252625, "epsilon_dpo/beta_margin_grad_std": 0.24138055741786957, "epsilon_dpo/beta_margin_mean": 1.036718487739563, "epsilon_dpo/beta_margin_std": 1.3732231855392456, "epsilon_dpo/loss_margin_mean": 24.285327911376953, "grad_norm": 55.269935607910156, "kl/avg_steps": 0.40625, "kl/beta": 0.043252602219581604, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.5364398801258394e-07, "logits/chosen": -3.445655584335327, "logits/rejected": -3.542100191116333, "logps/chosen": -62.8067741394043, "logps/ref_chosen": -44.29344177246094, "logps/ref_rejected": -57.49565887451172, "logps/rejected": -100.29432678222656, "loss": 0.9524, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8013401031494141, "rewards/margins": 1.036718487739563, "rewards/rejected": -1.838058590888977, "step": 185 }, { "epoch": 0.2811791383219955, "epsilon_dpo/beta": 0.04287995770573616, "epsilon_dpo/beta_margin_grad_mean": -0.29706496000289917, "epsilon_dpo/beta_margin_grad_std": 0.20956268906593323, "epsilon_dpo/beta_margin_mean": 1.1100947856903076, "epsilon_dpo/beta_margin_std": 1.2024697065353394, "epsilon_dpo/loss_margin_mean": 26.0731201171875, "grad_norm": 58.81276321411133, "kl/avg_steps": 0.46875, "kl/beta": 0.04307759925723076, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.5287418106563354e-07, "logits/chosen": -3.537816047668457, "logits/rejected": -3.5835390090942383, "logps/chosen": -70.7039794921875, "logps/ref_chosen": -56.45159912109375, "logps/ref_rejected": -76.78506469726562, "logps/rejected": -117.11056518554688, "loss": 0.8297, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6134045124053955, "rewards/margins": 1.1100947856903076, "rewards/rejected": -1.7234992980957031, "step": 186 }, { "epoch": 0.28269085411942557, "epsilon_dpo/beta": 0.04261289909482002, "epsilon_dpo/beta_margin_grad_mean": -0.3153734505176544, "epsilon_dpo/beta_margin_grad_std": 0.2297608107328415, "epsilon_dpo/beta_margin_mean": 1.0501863956451416, "epsilon_dpo/beta_margin_std": 1.3641703128814697, "epsilon_dpo/loss_margin_mean": 24.806705474853516, "grad_norm": 62.804962158203125, "kl/avg_steps": 0.625, "kl/beta": 0.04287661612033844, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.520986992917297e-07, "logits/chosen": -3.578516960144043, "logits/rejected": -3.540525436401367, "logps/chosen": -81.83871459960938, "logps/ref_chosen": -62.23444747924805, "logps/ref_rejected": -85.04208374023438, "logps/rejected": -129.45306396484375, "loss": 0.9331, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8379230499267578, "rewards/margins": 1.0501863956451416, "rewards/rejected": -1.8881094455718994, "step": 187 }, { "epoch": 0.2842025699168556, "epsilon_dpo/beta": 0.04238817095756531, "epsilon_dpo/beta_margin_grad_mean": -0.32389408349990845, "epsilon_dpo/beta_margin_grad_std": 0.2121199071407318, "epsilon_dpo/beta_margin_mean": 1.0027621984481812, "epsilon_dpo/beta_margin_std": 1.287280797958374, "epsilon_dpo/loss_margin_mean": 23.81753921508789, "grad_norm": 62.54477310180664, "kl/avg_steps": 0.53125, "kl/beta": 0.042610302567481995, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.5131756438276466e-07, "logits/chosen": -3.55548095703125, "logits/rejected": -3.472877025604248, "logps/chosen": -80.10108947753906, "logps/ref_chosen": -62.278526306152344, "logps/ref_rejected": -72.83716583251953, "logps/rejected": -114.47726440429688, "loss": 0.9145, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7587012052536011, "rewards/margins": 1.0027623176574707, "rewards/rejected": -1.7614635229110718, "step": 188 }, { "epoch": 0.2857142857142857, "epsilon_dpo/beta": 0.04215092957019806, "epsilon_dpo/beta_margin_grad_mean": -0.3220076262950897, "epsilon_dpo/beta_margin_grad_std": 0.2192068099975586, "epsilon_dpo/beta_margin_mean": 0.9814594984054565, "epsilon_dpo/beta_margin_std": 1.265926718711853, "epsilon_dpo/loss_margin_mean": 23.452102661132812, "grad_norm": 63.87660598754883, "kl/avg_steps": 0.5625, "kl/beta": 0.04238513112068176, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.5053079818876096e-07, "logits/chosen": -3.5255203247070312, "logits/rejected": -3.5486857891082764, "logps/chosen": -79.07627868652344, "logps/ref_chosen": -65.03617858886719, "logps/ref_rejected": -69.1073226928711, "logps/rejected": -106.59953308105469, "loss": 0.931, "rewards/accuracies": 0.75, "rewards/chosen": -0.5938483476638794, "rewards/margins": 0.9814594984054565, "rewards/rejected": -1.575307846069336, "step": 189 }, { "epoch": 0.2872260015117158, "epsilon_dpo/beta": 0.04187563806772232, "epsilon_dpo/beta_margin_grad_mean": -0.28723961114883423, "epsilon_dpo/beta_margin_grad_std": 0.19574160873889923, "epsilon_dpo/beta_margin_mean": 1.141900897026062, "epsilon_dpo/beta_margin_std": 1.1445918083190918, "epsilon_dpo/loss_margin_mean": 27.405109405517578, "grad_norm": 60.267250061035156, "kl/avg_steps": 0.65625, "kl/beta": 0.04214804619550705, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.4973842271726024e-07, "logits/chosen": -3.4026191234588623, "logits/rejected": -3.5591776371002197, "logps/chosen": -58.185455322265625, "logps/ref_chosen": -44.94601821899414, "logps/ref_rejected": -94.17027282714844, "logps/rejected": -134.8148193359375, "loss": 0.7817, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5568834543228149, "rewards/margins": 1.141900897026062, "rewards/rejected": -1.698784351348877, "step": 190 }, { "epoch": 0.2887377173091459, "epsilon_dpo/beta": 0.041641879826784134, "epsilon_dpo/beta_margin_grad_mean": -0.32959550619125366, "epsilon_dpo/beta_margin_grad_std": 0.2179221361875534, "epsilon_dpo/beta_margin_mean": 0.9561223387718201, "epsilon_dpo/beta_margin_std": 1.289340615272522, "epsilon_dpo/loss_margin_mean": 23.135189056396484, "grad_norm": 65.05615997314453, "kl/avg_steps": 0.5625, "kl/beta": 0.041873253881931305, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.48940460132708e-07, "logits/chosen": -3.541079521179199, "logits/rejected": -3.523141860961914, "logps/chosen": -77.8901596069336, "logps/ref_chosen": -59.36180877685547, "logps/ref_rejected": -80.5174560546875, "logps/rejected": -122.18099975585938, "loss": 0.9415, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7746530771255493, "rewards/margins": 0.9561223387718201, "rewards/rejected": -1.7307754755020142, "step": 191 }, { "epoch": 0.29024943310657597, "epsilon_dpo/beta": 0.04147402197122574, "epsilon_dpo/beta_margin_grad_mean": -0.3820907771587372, "epsilon_dpo/beta_margin_grad_std": 0.2094534933567047, "epsilon_dpo/beta_margin_mean": 0.5941454768180847, "epsilon_dpo/beta_margin_std": 1.1270208358764648, "epsilon_dpo/loss_margin_mean": 14.49808406829834, "grad_norm": 71.01517486572266, "kl/avg_steps": 0.40625, "kl/beta": 0.04163903370499611, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.481369327558329e-07, "logits/chosen": -3.457044839859009, "logits/rejected": -3.4702868461608887, "logps/chosen": -72.64617919921875, "logps/ref_chosen": -48.36589813232422, "logps/ref_rejected": -59.27112579345703, "logps/rejected": -98.04949188232422, "loss": 1.1393, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0092337131500244, "rewards/margins": 0.5941454172134399, "rewards/rejected": -1.6033791303634644, "step": 192 }, { "epoch": 0.29176114890400606, "epsilon_dpo/beta": 0.04122845456004143, "epsilon_dpo/beta_margin_grad_mean": -0.30623841285705566, "epsilon_dpo/beta_margin_grad_std": 0.19469518959522247, "epsilon_dpo/beta_margin_mean": 1.0199859142303467, "epsilon_dpo/beta_margin_std": 1.0905250310897827, "epsilon_dpo/loss_margin_mean": 24.88538360595703, "grad_norm": 56.5097541809082, "kl/avg_steps": 0.59375, "kl/beta": 0.04147056117653847, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.47327863063023e-07, "logits/chosen": -3.483124256134033, "logits/rejected": -3.4106087684631348, "logps/chosen": -64.31686401367188, "logps/ref_chosen": -47.2737922668457, "logps/ref_rejected": -54.394744873046875, "logps/rejected": -96.32319641113281, "loss": 0.8354, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7046444416046143, "rewards/margins": 1.0199859142303467, "rewards/rejected": -1.724630355834961, "step": 193 }, { "epoch": 0.29327286470143615, "epsilon_dpo/beta": 0.04102375730872154, "epsilon_dpo/beta_margin_grad_mean": -0.3304082751274109, "epsilon_dpo/beta_margin_grad_std": 0.21795611083507538, "epsilon_dpo/beta_margin_mean": 0.9337664842605591, "epsilon_dpo/beta_margin_std": 1.2138066291809082, "epsilon_dpo/loss_margin_mean": 22.944374084472656, "grad_norm": 61.967464447021484, "kl/avg_steps": 0.5, "kl/beta": 0.04122578352689743, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -3.478339672088623, "logits/rejected": -3.494858741760254, "logps/chosen": -65.27080535888672, "logps/ref_chosen": -51.22123336791992, "logps/ref_rejected": -59.38127899169922, "logps/rejected": -96.37522888183594, "loss": 0.9413, "rewards/accuracies": 0.75, "rewards/chosen": -0.5799208879470825, "rewards/margins": 0.9337664842605591, "rewards/rejected": -1.5136873722076416, "step": 194 }, { "epoch": 0.2947845804988662, "epsilon_dpo/beta": 0.04078119620680809, "epsilon_dpo/beta_margin_grad_mean": -0.2947201430797577, "epsilon_dpo/beta_margin_grad_std": 0.18896819651126862, "epsilon_dpo/beta_margin_mean": 1.0850766897201538, "epsilon_dpo/beta_margin_std": 1.1136102676391602, "epsilon_dpo/loss_margin_mean": 26.74505043029785, "grad_norm": 49.77657699584961, "kl/avg_steps": 0.59375, "kl/beta": 0.04102068021893501, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.4569318740967043e-07, "logits/chosen": -3.5864458084106445, "logits/rejected": -3.5390872955322266, "logps/chosen": -81.12614440917969, "logps/ref_chosen": -61.28152847290039, "logps/ref_rejected": -70.15548706054688, "logps/rejected": -116.74514770507812, "loss": 0.8034, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8131213188171387, "rewards/margins": 1.0850765705108643, "rewards/rejected": -1.898197889328003, "step": 195 }, { "epoch": 0.2962962962962963, "epsilon_dpo/beta": 0.04050225391983986, "epsilon_dpo/beta_margin_grad_mean": -0.30442044138908386, "epsilon_dpo/beta_margin_grad_std": 0.2004178911447525, "epsilon_dpo/beta_margin_mean": 1.0567375421524048, "epsilon_dpo/beta_margin_std": 1.1955032348632812, "epsilon_dpo/loss_margin_mean": 26.22124671936035, "grad_norm": 54.39183044433594, "kl/avg_steps": 0.6875, "kl/beta": 0.04077855497598648, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.448676271745197e-07, "logits/chosen": -3.4688010215759277, "logits/rejected": -3.577119827270508, "logps/chosen": -71.30818176269531, "logps/ref_chosen": -53.89015197753906, "logps/ref_rejected": -78.57406616210938, "logps/rejected": -122.21334075927734, "loss": 0.8498, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7074722051620483, "rewards/margins": 1.0567374229431152, "rewards/rejected": -1.7642097473144531, "step": 196 }, { "epoch": 0.29780801209372637, "epsilon_dpo/beta": 0.04025102034211159, "epsilon_dpo/beta_margin_grad_mean": -0.3317333161830902, "epsilon_dpo/beta_margin_grad_std": 0.21209289133548737, "epsilon_dpo/beta_margin_mean": 0.8992827534675598, "epsilon_dpo/beta_margin_std": 1.1738866567611694, "epsilon_dpo/loss_margin_mean": 22.486616134643555, "grad_norm": 54.86751174926758, "kl/avg_steps": 0.625, "kl/beta": 0.04050011932849884, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.440366160729392e-07, "logits/chosen": -3.4235453605651855, "logits/rejected": -3.582223415374756, "logps/chosen": -58.10289001464844, "logps/ref_chosen": -44.981239318847656, "logps/ref_rejected": -64.61582946777344, "logps/rejected": -100.22410583496094, "loss": 0.9468, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5306128263473511, "rewards/margins": 0.899282693862915, "rewards/rejected": -1.4298955202102661, "step": 197 }, { "epoch": 0.29931972789115646, "epsilon_dpo/beta": 0.0399758517742157, "epsilon_dpo/beta_margin_grad_mean": -0.3303770422935486, "epsilon_dpo/beta_margin_grad_std": 0.1886788308620453, "epsilon_dpo/beta_margin_mean": 0.8959324955940247, "epsilon_dpo/beta_margin_std": 1.0838531255722046, "epsilon_dpo/loss_margin_mean": 22.528654098510742, "grad_norm": 51.66859436035156, "kl/avg_steps": 0.6875, "kl/beta": 0.0402485653758049, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.432001773500957e-07, "logits/chosen": -3.4696359634399414, "logits/rejected": -3.555976152420044, "logps/chosen": -66.08497619628906, "logps/ref_chosen": -52.30570983886719, "logps/ref_rejected": -70.6080551147461, "logps/rejected": -106.91597747802734, "loss": 0.8981, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5536590814590454, "rewards/margins": 0.8959324955940247, "rewards/rejected": -1.4495916366577148, "step": 198 }, { "epoch": 0.30083144368858655, "epsilon_dpo/beta": 0.039777856320142746, "epsilon_dpo/beta_margin_grad_mean": -0.3384016752243042, "epsilon_dpo/beta_margin_grad_std": 0.21800272166728973, "epsilon_dpo/beta_margin_mean": 0.8261865973472595, "epsilon_dpo/beta_margin_std": 1.1496398448944092, "epsilon_dpo/loss_margin_mean": 20.95412254333496, "grad_norm": 61.684173583984375, "kl/avg_steps": 0.5, "kl/beta": 0.03997374698519707, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.4235833440297856e-07, "logits/chosen": -3.4730348587036133, "logits/rejected": -3.4606375694274902, "logps/chosen": -67.54776000976562, "logps/ref_chosen": -48.64945602416992, "logps/ref_rejected": -63.26519012451172, "logps/rejected": -103.11761474609375, "loss": 0.9927, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7551217675209045, "rewards/margins": 0.8261865973472595, "rewards/rejected": -1.581308364868164, "step": 199 }, { "epoch": 0.30234315948601664, "epsilon_dpo/beta": 0.03950536996126175, "epsilon_dpo/beta_margin_grad_mean": -0.308423787355423, "epsilon_dpo/beta_margin_grad_std": 0.18733318150043488, "epsilon_dpo/beta_margin_mean": 1.0158190727233887, "epsilon_dpo/beta_margin_std": 1.0778899192810059, "epsilon_dpo/loss_margin_mean": 25.827695846557617, "grad_norm": 54.301971435546875, "kl/avg_steps": 0.6875, "kl/beta": 0.03977487236261368, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.415111107797445e-07, "logits/chosen": -3.5410304069519043, "logits/rejected": -3.5030875205993652, "logps/chosen": -62.250614166259766, "logps/ref_chosen": -49.355560302734375, "logps/ref_rejected": -78.42619323730469, "logps/rejected": -117.1489486694336, "loss": 0.8262, "rewards/accuracies": 0.859375, "rewards/chosen": -0.51116943359375, "rewards/margins": 1.0158190727233887, "rewards/rejected": -1.5269885063171387, "step": 200 }, { "epoch": 0.30234315948601664, "eval_epsilon_dpo/beta": 0.03934308513998985, "eval_epsilon_dpo/beta_margin_grad_mean": -0.38161319494247437, "eval_epsilon_dpo/beta_margin_grad_std": 0.21187205612659454, "eval_epsilon_dpo/beta_margin_mean": 0.6240970492362976, "eval_epsilon_dpo/beta_margin_std": 1.1421700716018677, "eval_epsilon_dpo/loss_margin_mean": 16.05150604248047, "eval_kl/n_epsilon_steps": 0.29181337356567383, "eval_kl/p_epsilon_steps": 0.7073063254356384, "eval_logits/chosen": -3.581502676010132, "eval_logits/rejected": -3.564389228820801, "eval_logps/chosen": -97.914306640625, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -110.08529663085938, "eval_loss": 0.5655415654182434, "eval_rewards/accuracies": 0.7007042169570923, "eval_rewards/chosen": -0.8101938962936401, "eval_rewards/margins": 0.6240970492362976, "eval_rewards/rejected": -1.434290885925293, "eval_runtime": 37.0902, "eval_samples_per_second": 62.092, "eval_steps_per_second": 1.941, "step": 200 }, { "epoch": 0.30385487528344673, "epsilon_dpo/beta": 0.03928500786423683, "epsilon_dpo/beta_margin_grad_mean": -0.3455072045326233, "epsilon_dpo/beta_margin_grad_std": 0.1999111771583557, "epsilon_dpo/beta_margin_mean": 0.8040106296539307, "epsilon_dpo/beta_margin_std": 1.07520592212677, "epsilon_dpo/loss_margin_mean": 20.615453720092773, "grad_norm": 58.57802200317383, "kl/avg_steps": 0.5625, "kl/beta": 0.03950328752398491, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.4065853017905953e-07, "logits/chosen": -3.520378351211548, "logits/rejected": -3.489109516143799, "logps/chosen": -69.96910095214844, "logps/ref_chosen": -52.09891891479492, "logps/ref_rejected": -74.692138671875, "logps/rejected": -113.17778015136719, "loss": 0.9665, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7040920257568359, "rewards/margins": 0.8040106296539307, "rewards/rejected": -1.508102536201477, "step": 201 }, { "epoch": 0.30536659108087677, "epsilon_dpo/beta": 0.039114370942115784, "epsilon_dpo/beta_margin_grad_mean": -0.3650599718093872, "epsilon_dpo/beta_margin_grad_std": 0.201198548078537, "epsilon_dpo/beta_margin_mean": 0.6892377138137817, "epsilon_dpo/beta_margin_std": 1.02873694896698, "epsilon_dpo/loss_margin_mean": 17.799182891845703, "grad_norm": 55.17136001586914, "kl/avg_steps": 0.4375, "kl/beta": 0.03928232192993164, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.3980061644943575e-07, "logits/chosen": -3.480362892150879, "logits/rejected": -3.5090184211730957, "logps/chosen": -63.37360382080078, "logps/ref_chosen": -46.399715423583984, "logps/ref_rejected": -64.59156036376953, "logps/rejected": -99.36463165283203, "loss": 1.0314, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6678280830383301, "rewards/margins": 0.6892377138137817, "rewards/rejected": -1.3570657968521118, "step": 202 }, { "epoch": 0.30687830687830686, "epsilon_dpo/beta": 0.03888287767767906, "epsilon_dpo/beta_margin_grad_mean": -0.3348226249217987, "epsilon_dpo/beta_margin_grad_std": 0.20196466147899628, "epsilon_dpo/beta_margin_mean": 0.8438022136688232, "epsilon_dpo/beta_margin_std": 1.072197437286377, "epsilon_dpo/loss_margin_mean": 21.853126525878906, "grad_norm": 60.297645568847656, "kl/avg_steps": 0.59375, "kl/beta": 0.03911121189594269, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.3893739358856455e-07, "logits/chosen": -3.552091360092163, "logits/rejected": -3.6573781967163086, "logps/chosen": -76.8156967163086, "logps/ref_chosen": -55.67274475097656, "logps/ref_rejected": -87.79222106933594, "logps/rejected": -130.78829956054688, "loss": 0.9442, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8248593807220459, "rewards/margins": 0.8438022136688232, "rewards/rejected": -1.6686615943908691, "step": 203 }, { "epoch": 0.30839002267573695, "epsilon_dpo/beta": 0.038641221821308136, "epsilon_dpo/beta_margin_grad_mean": -0.32485708594322205, "epsilon_dpo/beta_margin_grad_std": 0.19857537746429443, "epsilon_dpo/beta_margin_mean": 0.9325970411300659, "epsilon_dpo/beta_margin_std": 1.1077971458435059, "epsilon_dpo/loss_margin_mean": 24.271240234375, "grad_norm": 48.70488357543945, "kl/avg_steps": 0.625, "kl/beta": 0.0388803593814373, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.380688857426449e-07, "logits/chosen": -3.468535900115967, "logits/rejected": -3.4710283279418945, "logps/chosen": -63.69876480102539, "logps/ref_chosen": -45.89850616455078, "logps/ref_rejected": -58.094093322753906, "logps/rejected": -100.16558837890625, "loss": 0.8942, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6892681121826172, "rewards/margins": 0.9325970411300659, "rewards/rejected": -1.6218652725219727, "step": 204 }, { "epoch": 0.30990173847316704, "epsilon_dpo/beta": 0.03840121626853943, "epsilon_dpo/beta_margin_grad_mean": -0.3173699975013733, "epsilon_dpo/beta_margin_grad_std": 0.2037927359342575, "epsilon_dpo/beta_margin_mean": 0.9641197323799133, "epsilon_dpo/beta_margin_std": 1.1961047649383545, "epsilon_dpo/loss_margin_mean": 25.26545524597168, "grad_norm": 53.5556640625, "kl/avg_steps": 0.625, "kl/beta": 0.03863886743783951, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.3719511720570814e-07, "logits/chosen": -3.5484752655029297, "logits/rejected": -3.5840582847595215, "logps/chosen": -78.00169372558594, "logps/ref_chosen": -60.3880615234375, "logps/ref_rejected": -81.88725280761719, "logps/rejected": -124.76634216308594, "loss": 0.9077, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6794295310974121, "rewards/margins": 0.9641197323799133, "rewards/rejected": -1.6435492038726807, "step": 205 }, { "epoch": 0.31141345427059713, "epsilon_dpo/beta": 0.038222700357437134, "epsilon_dpo/beta_margin_grad_mean": -0.3520456552505493, "epsilon_dpo/beta_margin_grad_std": 0.19689998030662537, "epsilon_dpo/beta_margin_mean": 0.7701746225357056, "epsilon_dpo/beta_margin_std": 1.0494840145111084, "epsilon_dpo/loss_margin_mean": 20.316253662109375, "grad_norm": 64.67769622802734, "kl/avg_steps": 0.46875, "kl/beta": 0.038398873060941696, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.363161124189387e-07, "logits/chosen": -3.4846065044403076, "logits/rejected": -3.550844669342041, "logps/chosen": -78.38066101074219, "logps/ref_chosen": -59.106754302978516, "logps/ref_rejected": -74.94796752929688, "logps/rejected": -114.53813171386719, "loss": 0.9765, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7397232055664062, "rewards/margins": 0.7701746225357056, "rewards/rejected": -1.5098978281021118, "step": 206 }, { "epoch": 0.3129251700680272, "epsilon_dpo/beta": 0.03799659013748169, "epsilon_dpo/beta_margin_grad_mean": -0.32501423358917236, "epsilon_dpo/beta_margin_grad_std": 0.20867857336997986, "epsilon_dpo/beta_margin_mean": 0.944418728351593, "epsilon_dpo/beta_margin_std": 1.2296777963638306, "epsilon_dpo/loss_margin_mean": 25.022621154785156, "grad_norm": 53.95839309692383, "kl/avg_steps": 0.59375, "kl/beta": 0.038219720125198364, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.3543189596998986e-07, "logits/chosen": -3.482269763946533, "logits/rejected": -3.597884178161621, "logps/chosen": -83.94605255126953, "logps/ref_chosen": -58.953582763671875, "logps/ref_rejected": -79.804443359375, "logps/rejected": -129.8195343017578, "loss": 0.9302, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9532793164253235, "rewards/margins": 0.9444186687469482, "rewards/rejected": -1.8976980447769165, "step": 207 }, { "epoch": 0.3144368858654573, "epsilon_dpo/beta": 0.037784188985824585, "epsilon_dpo/beta_margin_grad_mean": -0.34813281893730164, "epsilon_dpo/beta_margin_grad_std": 0.1934841275215149, "epsilon_dpo/beta_margin_mean": 0.7770121693611145, "epsilon_dpo/beta_margin_std": 1.0266234874725342, "epsilon_dpo/loss_margin_mean": 20.716184616088867, "grad_norm": 53.02324676513672, "kl/avg_steps": 0.5625, "kl/beta": 0.037994127720594406, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.3454249259229664e-07, "logits/chosen": -3.5287129878997803, "logits/rejected": -3.4734394550323486, "logps/chosen": -69.11830139160156, "logps/ref_chosen": -52.9552001953125, "logps/ref_rejected": -64.94300842285156, "logps/rejected": -101.82229614257812, "loss": 0.9657, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6125558614730835, "rewards/margins": 0.7770121693611145, "rewards/rejected": -1.3895680904388428, "step": 208 }, { "epoch": 0.31594860166288735, "epsilon_dpo/beta": 0.03758465126156807, "epsilon_dpo/beta_margin_grad_mean": -0.3302347660064697, "epsilon_dpo/beta_margin_grad_std": 0.19674944877624512, "epsilon_dpo/beta_margin_mean": 0.8775627613067627, "epsilon_dpo/beta_margin_std": 1.1221497058868408, "epsilon_dpo/loss_margin_mean": 23.515052795410156, "grad_norm": 54.00407409667969, "kl/avg_steps": 0.53125, "kl/beta": 0.03778160735964775, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.336479271643833e-07, "logits/chosen": -3.4869484901428223, "logits/rejected": -3.5835154056549072, "logps/chosen": -78.59196472167969, "logps/ref_chosen": -57.14421081542969, "logps/ref_rejected": -83.43329620361328, "logps/rejected": -128.39608764648438, "loss": 0.9327, "rewards/accuracies": 0.828125, "rewards/chosen": -0.809949517250061, "rewards/margins": 0.8775627613067627, "rewards/rejected": -1.6875122785568237, "step": 209 }, { "epoch": 0.31746031746031744, "epsilon_dpo/beta": 0.03738604113459587, "epsilon_dpo/beta_margin_grad_mean": -0.32786819338798523, "epsilon_dpo/beta_margin_grad_std": 0.21687400341033936, "epsilon_dpo/beta_margin_mean": 0.9459735751152039, "epsilon_dpo/beta_margin_std": 1.213609218597412, "epsilon_dpo/loss_margin_mean": 25.493974685668945, "grad_norm": 47.389076232910156, "kl/avg_steps": 0.53125, "kl/beta": 0.03758195415139198, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.327482247091679e-07, "logits/chosen": -3.5033187866210938, "logits/rejected": -3.605921745300293, "logps/chosen": -72.78206634521484, "logps/ref_chosen": -53.84038543701172, "logps/ref_rejected": -79.80218505859375, "logps/rejected": -124.23783874511719, "loss": 0.9287, "rewards/accuracies": 0.75, "rewards/chosen": -0.7122611999511719, "rewards/margins": 0.9459735155105591, "rewards/rejected": -1.6582348346710205, "step": 210 }, { "epoch": 0.31897203325774753, "epsilon_dpo/beta": 0.03714174032211304, "epsilon_dpo/beta_margin_grad_mean": -0.3196851909160614, "epsilon_dpo/beta_margin_grad_std": 0.18467329442501068, "epsilon_dpo/beta_margin_mean": 0.9319191575050354, "epsilon_dpo/beta_margin_std": 1.0876432657241821, "epsilon_dpo/loss_margin_mean": 25.21488380432129, "grad_norm": 49.89854431152344, "kl/avg_steps": 0.65625, "kl/beta": 0.03738335520029068, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.3184341039326217e-07, "logits/chosen": -3.4429006576538086, "logits/rejected": -3.576409101486206, "logps/chosen": -67.97000122070312, "logps/ref_chosen": -47.98066711425781, "logps/ref_rejected": -86.38960266113281, "logps/rejected": -131.59381103515625, "loss": 0.8795, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7448057532310486, "rewards/margins": 0.9319191575050354, "rewards/rejected": -1.676724910736084, "step": 211 }, { "epoch": 0.3204837490551776, "epsilon_dpo/beta": 0.03694601356983185, "epsilon_dpo/beta_margin_grad_mean": -0.3292403519153595, "epsilon_dpo/beta_margin_grad_std": 0.2179199606180191, "epsilon_dpo/beta_margin_mean": 0.9338294863700867, "epsilon_dpo/beta_margin_std": 1.2748308181762695, "epsilon_dpo/loss_margin_mean": 25.47688102722168, "grad_norm": 67.411376953125, "kl/avg_steps": 0.53125, "kl/beta": 0.03713962435722351, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.309335095262675e-07, "logits/chosen": -3.5001413822174072, "logits/rejected": -3.455796241760254, "logps/chosen": -72.02558898925781, "logps/ref_chosen": -47.24024963378906, "logps/ref_rejected": -72.20423126220703, "logps/rejected": -122.46646118164062, "loss": 0.9607, "rewards/accuracies": 0.75, "rewards/chosen": -0.9192423820495605, "rewards/margins": 0.9338294267654419, "rewards/rejected": -1.8530718088150024, "step": 212 }, { "epoch": 0.3219954648526077, "epsilon_dpo/beta": 0.036750778555870056, "epsilon_dpo/beta_margin_grad_mean": -0.3358621299266815, "epsilon_dpo/beta_margin_grad_std": 0.2188369482755661, "epsilon_dpo/beta_margin_mean": 0.9037906527519226, "epsilon_dpo/beta_margin_std": 1.275901198387146, "epsilon_dpo/loss_margin_mean": 24.7879695892334, "grad_norm": 55.734466552734375, "kl/avg_steps": 0.53125, "kl/beta": 0.03694336488842964, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.3001854756006724e-07, "logits/chosen": -3.598665237426758, "logits/rejected": -3.5424110889434814, "logps/chosen": -74.67687225341797, "logps/ref_chosen": -56.815185546875, "logps/ref_rejected": -73.15438842773438, "logps/rejected": -115.80404663085938, "loss": 0.9776, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6588544845581055, "rewards/margins": 0.9037907123565674, "rewards/rejected": -1.5626451969146729, "step": 213 }, { "epoch": 0.3235071806500378, "epsilon_dpo/beta": 0.03655656799674034, "epsilon_dpo/beta_margin_grad_mean": -0.32683849334716797, "epsilon_dpo/beta_margin_grad_std": 0.2327646166086197, "epsilon_dpo/beta_margin_mean": 0.9596722722053528, "epsilon_dpo/beta_margin_std": 1.3057315349578857, "epsilon_dpo/loss_margin_mean": 26.464004516601562, "grad_norm": 63.16493225097656, "kl/avg_steps": 0.53125, "kl/beta": 0.03674813732504845, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.290985500881143e-07, "logits/chosen": -3.538296937942505, "logits/rejected": -3.4560623168945312, "logps/chosen": -80.40599060058594, "logps/ref_chosen": -55.0026741027832, "logps/ref_rejected": -59.64122772216797, "logps/rejected": -111.508544921875, "loss": 0.9696, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9335716962814331, "rewards/margins": 0.9596723318099976, "rewards/rejected": -1.8932440280914307, "step": 214 }, { "epoch": 0.3250188964474679, "epsilon_dpo/beta": 0.03635196387767792, "epsilon_dpo/beta_margin_grad_mean": -0.2959337830543518, "epsilon_dpo/beta_margin_grad_std": 0.21179771423339844, "epsilon_dpo/beta_margin_mean": 1.1824769973754883, "epsilon_dpo/beta_margin_std": 1.3134583234786987, "epsilon_dpo/loss_margin_mean": 32.724403381347656, "grad_norm": 52.883399963378906, "kl/avg_steps": 0.5625, "kl/beta": 0.036553945392370224, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.281735428447157e-07, "logits/chosen": -3.492943286895752, "logits/rejected": -3.519343137741089, "logps/chosen": -83.35348510742188, "logps/ref_chosen": -56.205360412597656, "logps/ref_rejected": -84.36219787597656, "logps/rejected": -144.23472595214844, "loss": 0.8196, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9902360439300537, "rewards/margins": 1.1824769973754883, "rewards/rejected": -2.172712802886963, "step": 215 }, { "epoch": 0.32653061224489793, "epsilon_dpo/beta": 0.03621678799390793, "epsilon_dpo/beta_margin_grad_mean": -0.33863499760627747, "epsilon_dpo/beta_margin_grad_std": 0.2535441219806671, "epsilon_dpo/beta_margin_mean": 0.9454448819160461, "epsilon_dpo/beta_margin_std": 1.4889917373657227, "epsilon_dpo/loss_margin_mean": 26.397720336914062, "grad_norm": 69.55057525634766, "kl/avg_steps": 0.375, "kl/beta": 0.03634947910904884, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.2724355170431247e-07, "logits/chosen": -3.576535224914551, "logits/rejected": -3.60660457611084, "logps/chosen": -90.8216552734375, "logps/ref_chosen": -62.24871826171875, "logps/ref_rejected": -84.99552917480469, "logps/rejected": -139.9661865234375, "loss": 1.0582, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0395915508270264, "rewards/margins": 0.9454448819160461, "rewards/rejected": -1.9850363731384277, "step": 216 }, { "epoch": 0.328042328042328, "epsilon_dpo/beta": 0.0360022597014904, "epsilon_dpo/beta_margin_grad_mean": -0.2982618808746338, "epsilon_dpo/beta_margin_grad_std": 0.20458948612213135, "epsilon_dpo/beta_margin_mean": 1.0852073431015015, "epsilon_dpo/beta_margin_std": 1.189354658126831, "epsilon_dpo/loss_margin_mean": 30.31786346435547, "grad_norm": 58.940521240234375, "kl/avg_steps": 0.59375, "kl/beta": 0.03621367737650871, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.26308602680756e-07, "logits/chosen": -3.510631799697876, "logits/rejected": -3.4744439125061035, "logps/chosen": -83.55587768554688, "logps/ref_chosen": -55.43699645996094, "logps/ref_rejected": -74.10791778564453, "logps/rejected": -132.54466247558594, "loss": 0.8348, "rewards/accuracies": 0.859375, "rewards/chosen": -1.015423059463501, "rewards/margins": 1.0852073431015015, "rewards/rejected": -2.100630283355713, "step": 217 }, { "epoch": 0.3295540438397581, "epsilon_dpo/beta": 0.035846009850502014, "epsilon_dpo/beta_margin_grad_mean": -0.36956170201301575, "epsilon_dpo/beta_margin_grad_std": 0.23382312059402466, "epsilon_dpo/beta_margin_mean": 0.7297128438949585, "epsilon_dpo/beta_margin_std": 1.3466575145721436, "epsilon_dpo/loss_margin_mean": 20.59137725830078, "grad_norm": 64.67948150634766, "kl/avg_steps": 0.4375, "kl/beta": 0.035999927669763565, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.253687219265803e-07, "logits/chosen": -3.586333751678467, "logits/rejected": -3.5443947315216064, "logps/chosen": -90.42942810058594, "logps/ref_chosen": -63.364742279052734, "logps/ref_rejected": -66.95477294921875, "logps/rejected": -114.61083221435547, "loss": 1.1325, "rewards/accuracies": 0.75, "rewards/chosen": -0.9732264876365662, "rewards/margins": 0.7297128438949585, "rewards/rejected": -1.7029392719268799, "step": 218 }, { "epoch": 0.3310657596371882, "epsilon_dpo/beta": 0.03564506024122238, "epsilon_dpo/beta_margin_grad_mean": -0.3297954201698303, "epsilon_dpo/beta_margin_grad_std": 0.18660549819469452, "epsilon_dpo/beta_margin_mean": 0.8488391637802124, "epsilon_dpo/beta_margin_std": 0.9588797688484192, "epsilon_dpo/loss_margin_mean": 23.970008850097656, "grad_norm": 54.55857467651367, "kl/avg_steps": 0.5625, "kl/beta": 0.03584311529994011, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.2442393573227043e-07, "logits/chosen": -3.5978140830993652, "logits/rejected": -3.5683038234710693, "logps/chosen": -77.64155578613281, "logps/ref_chosen": -55.58036422729492, "logps/ref_rejected": -69.35343933105469, "logps/rejected": -115.38463592529297, "loss": 0.8975, "rewards/accuracies": 0.75, "rewards/chosen": -0.7896758317947388, "rewards/margins": 0.8488391637802124, "rewards/rejected": -1.6385149955749512, "step": 219 }, { "epoch": 0.3325774754346183, "epsilon_dpo/beta": 0.0354568213224411, "epsilon_dpo/beta_margin_grad_mean": -0.3114323318004608, "epsilon_dpo/beta_margin_grad_std": 0.2148263156414032, "epsilon_dpo/beta_margin_mean": 1.008568286895752, "epsilon_dpo/beta_margin_std": 1.197187066078186, "epsilon_dpo/loss_margin_mean": 28.660755157470703, "grad_norm": 55.3653564453125, "kl/avg_steps": 0.53125, "kl/beta": 0.035642627626657486, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.234742705255272e-07, "logits/chosen": -3.397692918777466, "logits/rejected": -3.4540910720825195, "logps/chosen": -59.221744537353516, "logps/ref_chosen": -38.90839385986328, "logps/ref_rejected": -65.69367980957031, "logps/rejected": -114.66779327392578, "loss": 0.8883, "rewards/accuracies": 0.765625, "rewards/chosen": -0.722341775894165, "rewards/margins": 1.008568286895752, "rewards/rejected": -1.730910062789917, "step": 220 }, { "epoch": 0.3340891912320484, "epsilon_dpo/beta": 0.03528052940964699, "epsilon_dpo/beta_margin_grad_mean": -0.3516346216201782, "epsilon_dpo/beta_margin_grad_std": 0.21390923857688904, "epsilon_dpo/beta_margin_mean": 0.8392696380615234, "epsilon_dpo/beta_margin_std": 1.2753064632415771, "epsilon_dpo/loss_margin_mean": 23.983644485473633, "grad_norm": 62.309234619140625, "kl/avg_steps": 0.5, "kl/beta": 0.03545427322387695, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.22519752870528e-07, "logits/chosen": -3.5186450481414795, "logits/rejected": -3.6691133975982666, "logps/chosen": -68.44354248046875, "logps/ref_chosen": -51.12848663330078, "logps/ref_rejected": -85.7921142578125, "logps/rejected": -127.09080505371094, "loss": 1.0101, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6135537028312683, "rewards/margins": 0.8392696380615234, "rewards/rejected": -1.4528234004974365, "step": 221 }, { "epoch": 0.3356009070294785, "epsilon_dpo/beta": 0.035016801208257675, "epsilon_dpo/beta_margin_grad_mean": -0.2968342900276184, "epsilon_dpo/beta_margin_grad_std": 0.1912674456834793, "epsilon_dpo/beta_margin_mean": 1.1297993659973145, "epsilon_dpo/beta_margin_std": 1.1961920261383057, "epsilon_dpo/loss_margin_mean": 32.38107681274414, "grad_norm": 50.831932067871094, "kl/avg_steps": 0.75, "kl/beta": 0.03527788445353508, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -3.5614213943481445, "logits/rejected": -3.6921582221984863, "logps/chosen": -65.93031311035156, "logps/ref_chosen": -47.75957489013672, "logps/ref_rejected": -107.82502746582031, "logps/rejected": -158.37684631347656, "loss": 0.7943, "rewards/accuracies": 0.859375, "rewards/chosen": -0.638631284236908, "rewards/margins": 1.1297993659973145, "rewards/rejected": -1.7684307098388672, "step": 222 }, { "epoch": 0.3371126228269085, "epsilon_dpo/beta": 0.03486556187272072, "epsilon_dpo/beta_margin_grad_mean": -0.34731340408325195, "epsilon_dpo/beta_margin_grad_std": 0.20615656673908234, "epsilon_dpo/beta_margin_mean": 0.800473153591156, "epsilon_dpo/beta_margin_std": 1.0882115364074707, "epsilon_dpo/loss_margin_mean": 23.161121368408203, "grad_norm": 58.54691696166992, "kl/avg_steps": 0.4375, "kl/beta": 0.03501527011394501, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.2059626715039065e-07, "logits/chosen": -3.5788512229919434, "logits/rejected": -3.6575284004211426, "logps/chosen": -72.98533630371094, "logps/ref_chosen": -52.85289764404297, "logps/ref_rejected": -79.98722076416016, "logps/rejected": -123.28077697753906, "loss": 0.9759, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7046984434127808, "rewards/margins": 0.8004730939865112, "rewards/rejected": -1.505171537399292, "step": 223 }, { "epoch": 0.3386243386243386, "epsilon_dpo/beta": 0.03471369296312332, "epsilon_dpo/beta_margin_grad_mean": -0.38026106357574463, "epsilon_dpo/beta_margin_grad_std": 0.18396450579166412, "epsilon_dpo/beta_margin_mean": 0.6009146571159363, "epsilon_dpo/beta_margin_std": 0.9940208792686462, "epsilon_dpo/loss_margin_mean": 17.472631454467773, "grad_norm": 57.300209045410156, "kl/avg_steps": 0.4375, "kl/beta": 0.03486274555325508, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.1962735288928304e-07, "logits/chosen": -3.576627254486084, "logits/rejected": -3.6696414947509766, "logps/chosen": -78.13873291015625, "logps/ref_chosen": -56.696685791015625, "logps/ref_rejected": -67.6631851196289, "logps/rejected": -106.5778579711914, "loss": 1.0718, "rewards/accuracies": 0.75, "rewards/chosen": -0.7469635009765625, "rewards/margins": 0.6009146571159363, "rewards/rejected": -1.3478782176971436, "step": 224 }, { "epoch": 0.3401360544217687, "epsilon_dpo/beta": 0.034486547112464905, "epsilon_dpo/beta_margin_grad_mean": -0.3092819154262543, "epsilon_dpo/beta_margin_grad_std": 0.18451638519763947, "epsilon_dpo/beta_margin_mean": 0.9650672078132629, "epsilon_dpo/beta_margin_std": 0.9786075353622437, "epsilon_dpo/loss_margin_mean": 28.12489891052246, "grad_norm": 46.80144119262695, "kl/avg_steps": 0.65625, "kl/beta": 0.03471088781952858, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.186536937864752e-07, "logits/chosen": -3.5316030979156494, "logits/rejected": -3.601656436920166, "logps/chosen": -76.49637603759766, "logps/ref_chosen": -57.57122039794922, "logps/ref_rejected": -80.76596069335938, "logps/rejected": -127.81600952148438, "loss": 0.8299, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6543439626693726, "rewards/margins": 0.9650672078132629, "rewards/rejected": -1.6194111108779907, "step": 225 }, { "epoch": 0.3416477702191988, "epsilon_dpo/beta": 0.03430481255054474, "epsilon_dpo/beta_margin_grad_mean": -0.320197731256485, "epsilon_dpo/beta_margin_grad_std": 0.20055249333381653, "epsilon_dpo/beta_margin_mean": 0.9438491463661194, "epsilon_dpo/beta_margin_std": 1.1082851886749268, "epsilon_dpo/loss_margin_mean": 27.70347023010254, "grad_norm": 43.68962860107422, "kl/avg_steps": 0.53125, "kl/beta": 0.03448458015918732, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.176753170773052e-07, "logits/chosen": -3.5373995304107666, "logits/rejected": -3.5118868350982666, "logps/chosen": -57.3145751953125, "logps/ref_chosen": -44.87506866455078, "logps/ref_rejected": -60.68815994262695, "logps/rejected": -100.83113098144531, "loss": 0.8888, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4285694360733032, "rewards/margins": 0.9438491463661194, "rewards/rejected": -1.3724186420440674, "step": 226 }, { "epoch": 0.3431594860166289, "epsilon_dpo/beta": 0.03412352874875069, "epsilon_dpo/beta_margin_grad_mean": -0.35344985127449036, "epsilon_dpo/beta_margin_grad_std": 0.1964007169008255, "epsilon_dpo/beta_margin_mean": 0.756708025932312, "epsilon_dpo/beta_margin_std": 1.0710712671279907, "epsilon_dpo/loss_margin_mean": 22.350990295410156, "grad_norm": 47.4157829284668, "kl/avg_steps": 0.53125, "kl/beta": 0.034302350133657455, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.166922501290729e-07, "logits/chosen": -3.511568307876587, "logits/rejected": -3.510512351989746, "logps/chosen": -65.66714477539062, "logps/ref_chosen": -49.219905853271484, "logps/ref_rejected": -69.90177917480469, "logps/rejected": -108.70001220703125, "loss": 0.9923, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5635974407196045, "rewards/margins": 0.756708025932312, "rewards/rejected": -1.3203054666519165, "step": 227 }, { "epoch": 0.34467120181405897, "epsilon_dpo/beta": 0.033953871577978134, "epsilon_dpo/beta_margin_grad_mean": -0.33064186573028564, "epsilon_dpo/beta_margin_grad_std": 0.19997283816337585, "epsilon_dpo/beta_margin_mean": 0.8808642029762268, "epsilon_dpo/beta_margin_std": 1.0767749547958374, "epsilon_dpo/loss_margin_mean": 26.133075714111328, "grad_norm": 52.0386962890625, "kl/avg_steps": 0.5, "kl/beta": 0.0341210812330246, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.1570452044027405e-07, "logits/chosen": -3.525053024291992, "logits/rejected": -3.5127744674682617, "logps/chosen": -69.43989562988281, "logps/ref_chosen": -50.81490707397461, "logps/ref_rejected": -68.55366516113281, "logps/rejected": -113.31173706054688, "loss": 0.9193, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6340755224227905, "rewards/margins": 0.8808642029762268, "rewards/rejected": -1.514939785003662, "step": 228 }, { "epoch": 0.34618291761148906, "epsilon_dpo/beta": 0.033742502331733704, "epsilon_dpo/beta_margin_grad_mean": -0.3358643651008606, "epsilon_dpo/beta_margin_grad_std": 0.19054366648197174, "epsilon_dpo/beta_margin_mean": 0.8090803027153015, "epsilon_dpo/beta_margin_std": 1.0081828832626343, "epsilon_dpo/loss_margin_mean": 24.13226890563965, "grad_norm": 52.22861099243164, "kl/avg_steps": 0.625, "kl/beta": 0.033951323479413986, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.147121556398312e-07, "logits/chosen": -3.5843217372894287, "logits/rejected": -3.5501012802124023, "logps/chosen": -61.42178726196289, "logps/ref_chosen": -46.11479949951172, "logps/ref_rejected": -71.94685363769531, "logps/rejected": -111.3861083984375, "loss": 0.9413, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5182480812072754, "rewards/margins": 0.8090802431106567, "rewards/rejected": -1.3273283243179321, "step": 229 }, { "epoch": 0.3476946334089191, "epsilon_dpo/beta": 0.03354346379637718, "epsilon_dpo/beta_margin_grad_mean": -0.3465607464313507, "epsilon_dpo/beta_margin_grad_std": 0.20480602979660034, "epsilon_dpo/beta_margin_mean": 0.780937671661377, "epsilon_dpo/beta_margin_std": 1.1852768659591675, "epsilon_dpo/loss_margin_mean": 23.4627685546875, "grad_norm": 57.785579681396484, "kl/avg_steps": 0.59375, "kl/beta": 0.03374044597148895, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.137151834863213e-07, "logits/chosen": -3.6343181133270264, "logits/rejected": -3.63087797164917, "logps/chosen": -76.2677993774414, "logps/ref_chosen": -56.47395706176758, "logps/ref_rejected": -59.56927490234375, "logps/rejected": -102.82588195800781, "loss": 1.0232, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6676421761512756, "rewards/margins": 0.780937671661377, "rewards/rejected": -1.4485797882080078, "step": 230 }, { "epoch": 0.3492063492063492, "epsilon_dpo/beta": 0.033293064683675766, "epsilon_dpo/beta_margin_grad_mean": -0.2813434600830078, "epsilon_dpo/beta_margin_grad_std": 0.18429261445999146, "epsilon_dpo/beta_margin_mean": 1.157078742980957, "epsilon_dpo/beta_margin_std": 1.064234733581543, "epsilon_dpo/loss_margin_mean": 34.87957763671875, "grad_norm": 47.93153762817383, "kl/avg_steps": 0.75, "kl/beta": 0.03354129567742348, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.1271363186719835e-07, "logits/chosen": -3.5638747215270996, "logits/rejected": -3.538975715637207, "logps/chosen": -80.92160034179688, "logps/ref_chosen": -61.39866638183594, "logps/ref_rejected": -80.97687530517578, "logps/rejected": -135.37939453125, "loss": 0.7458, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6524353623390198, "rewards/margins": 1.157078742980957, "rewards/rejected": -1.809514045715332, "step": 231 }, { "epoch": 0.3507180650037793, "epsilon_dpo/beta": 0.03310765326023102, "epsilon_dpo/beta_margin_grad_mean": -0.32703742384910583, "epsilon_dpo/beta_margin_grad_std": 0.2065211832523346, "epsilon_dpo/beta_margin_mean": 0.9263335466384888, "epsilon_dpo/beta_margin_std": 1.1869854927062988, "epsilon_dpo/loss_margin_mean": 28.175167083740234, "grad_norm": 54.2735481262207, "kl/avg_steps": 0.5625, "kl/beta": 0.03329160809516907, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.1170752879801436e-07, "logits/chosen": -3.5799169540405273, "logits/rejected": -3.657036304473877, "logps/chosen": -82.17314147949219, "logps/ref_chosen": -58.082557678222656, "logps/ref_rejected": -80.71989440917969, "logps/rejected": -132.9856414794922, "loss": 0.9238, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7996007204055786, "rewards/margins": 0.9263335466384888, "rewards/rejected": -1.7259342670440674, "step": 232 }, { "epoch": 0.35222978080120937, "epsilon_dpo/beta": 0.032943159341812134, "epsilon_dpo/beta_margin_grad_mean": -0.364286869764328, "epsilon_dpo/beta_margin_grad_std": 0.2133944183588028, "epsilon_dpo/beta_margin_mean": 0.6965461373329163, "epsilon_dpo/beta_margin_std": 1.1161088943481445, "epsilon_dpo/loss_margin_mean": 21.35443878173828, "grad_norm": 54.48058319091797, "kl/avg_steps": 0.5, "kl/beta": 0.03310539200901985, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.106969024216348e-07, "logits/chosen": -3.49937105178833, "logits/rejected": -3.567963123321533, "logps/chosen": -75.5169906616211, "logps/ref_chosen": -48.17005157470703, "logps/ref_rejected": -60.69952392578125, "logps/rejected": -109.40089416503906, "loss": 1.0643, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9055917263031006, "rewards/margins": 0.6965460777282715, "rewards/rejected": -1.602137804031372, "step": 233 }, { "epoch": 0.35374149659863946, "epsilon_dpo/beta": 0.0327998511493206, "epsilon_dpo/beta_margin_grad_mean": -0.36621546745300293, "epsilon_dpo/beta_margin_grad_std": 0.2287471890449524, "epsilon_dpo/beta_margin_mean": 0.6960792541503906, "epsilon_dpo/beta_margin_std": 1.186265230178833, "epsilon_dpo/loss_margin_mean": 21.471349716186523, "grad_norm": 68.93730163574219, "kl/avg_steps": 0.4375, "kl/beta": 0.032940685749053955, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.09681781007452e-07, "logits/chosen": -3.5102791786193848, "logits/rejected": -3.4917590618133545, "logps/chosen": -76.93553161621094, "logps/ref_chosen": -50.796932220458984, "logps/ref_rejected": -48.63116455078125, "logps/rejected": -96.2411117553711, "loss": 1.1007, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8619002103805542, "rewards/margins": 0.6960792541503906, "rewards/rejected": -1.5579794645309448, "step": 234 }, { "epoch": 0.35525321239606955, "epsilon_dpo/beta": 0.03257497400045395, "epsilon_dpo/beta_margin_grad_mean": -0.29326194524765015, "epsilon_dpo/beta_margin_grad_std": 0.18073107302188873, "epsilon_dpo/beta_margin_mean": 1.0693387985229492, "epsilon_dpo/beta_margin_std": 0.9997078776359558, "epsilon_dpo/loss_margin_mean": 32.96505355834961, "grad_norm": 44.331321716308594, "kl/avg_steps": 0.6875, "kl/beta": 0.0327971987426281, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.08662192950594e-07, "logits/chosen": -3.5926835536956787, "logits/rejected": -3.61897611618042, "logps/chosen": -79.71795654296875, "logps/ref_chosen": -58.720252990722656, "logps/ref_rejected": -74.06263732910156, "logps/rejected": -128.025390625, "loss": 0.7739, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6849663853645325, "rewards/margins": 1.0693387985229492, "rewards/rejected": -1.7543052434921265, "step": 235 }, { "epoch": 0.35676492819349964, "epsilon_dpo/beta": 0.03238309547305107, "epsilon_dpo/beta_margin_grad_mean": -0.31811246275901794, "epsilon_dpo/beta_margin_grad_std": 0.20673036575317383, "epsilon_dpo/beta_margin_mean": 0.9451549649238586, "epsilon_dpo/beta_margin_std": 1.1316167116165161, "epsilon_dpo/loss_margin_mean": 29.385644912719727, "grad_norm": 59.131553649902344, "kl/avg_steps": 0.59375, "kl/beta": 0.032573260366916656, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.076381667711306e-07, "logits/chosen": -3.6510281562805176, "logits/rejected": -3.5522732734680176, "logps/chosen": -93.43757629394531, "logps/ref_chosen": -60.77384948730469, "logps/ref_rejected": -76.09827423095703, "logps/rejected": -138.14764404296875, "loss": 0.9033, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0610227584838867, "rewards/margins": 0.9451549649238586, "rewards/rejected": -2.0061779022216797, "step": 236 }, { "epoch": 0.35827664399092973, "epsilon_dpo/beta": 0.03226279839873314, "epsilon_dpo/beta_margin_grad_mean": -0.34871017932891846, "epsilon_dpo/beta_margin_grad_std": 0.23624296486377716, "epsilon_dpo/beta_margin_mean": 0.8402957320213318, "epsilon_dpo/beta_margin_std": 1.295149803161621, "epsilon_dpo/loss_margin_mean": 26.331083297729492, "grad_norm": 68.4939193725586, "kl/avg_steps": 0.375, "kl/beta": 0.03238099813461304, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.066097311132753e-07, "logits/chosen": -3.571560859680176, "logits/rejected": -3.590184211730957, "logps/chosen": -97.25656127929688, "logps/ref_chosen": -65.96961975097656, "logps/ref_rejected": -77.60652923583984, "logps/rejected": -135.22454833984375, "loss": 1.0452, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0126960277557373, "rewards/margins": 0.840295672416687, "rewards/rejected": -1.8529917001724243, "step": 237 }, { "epoch": 0.35978835978835977, "epsilon_dpo/beta": 0.03209185227751732, "epsilon_dpo/beta_margin_grad_mean": -0.3394152820110321, "epsilon_dpo/beta_margin_grad_std": 0.22248654067516327, "epsilon_dpo/beta_margin_mean": 0.8316738605499268, "epsilon_dpo/beta_margin_std": 1.1292351484298706, "epsilon_dpo/loss_margin_mean": 26.1365966796875, "grad_norm": 210.83578491210938, "kl/avg_steps": 0.53125, "kl/beta": 0.032260023057460785, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.0557691474458414e-07, "logits/chosen": -3.4619462490081787, "logits/rejected": -3.4855332374572754, "logps/chosen": -145.80715942382812, "logps/ref_chosen": -118.00836181640625, "logps/ref_rejected": -67.71189880371094, "logps/rejected": -121.64729309082031, "loss": 0.9852, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8954745531082153, "rewards/margins": 0.8316739201545715, "rewards/rejected": -1.7271485328674316, "step": 238 }, { "epoch": 0.36130007558578986, "epsilon_dpo/beta": 0.03190220519900322, "epsilon_dpo/beta_margin_grad_mean": -0.32428762316703796, "epsilon_dpo/beta_margin_grad_std": 0.19579172134399414, "epsilon_dpo/beta_margin_mean": 0.9428533911705017, "epsilon_dpo/beta_margin_std": 1.1371135711669922, "epsilon_dpo/loss_margin_mean": 29.73337745666504, "grad_norm": 55.09555435180664, "kl/avg_steps": 0.59375, "kl/beta": 0.03208954632282257, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.045397465551513e-07, "logits/chosen": -3.5148532390594482, "logits/rejected": -3.626768112182617, "logps/chosen": -81.9236068725586, "logps/ref_chosen": -49.83290100097656, "logps/ref_rejected": -99.18492126464844, "logps/rejected": -161.00900268554688, "loss": 0.8895, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0244648456573486, "rewards/margins": 0.9428534507751465, "rewards/rejected": -1.9673182964324951, "step": 239 }, { "epoch": 0.36281179138321995, "epsilon_dpo/beta": 0.03167402371764183, "epsilon_dpo/beta_margin_grad_mean": -0.29107651114463806, "epsilon_dpo/beta_margin_grad_std": 0.1893853396177292, "epsilon_dpo/beta_margin_mean": 1.0700339078903198, "epsilon_dpo/beta_margin_std": 1.0119413137435913, "epsilon_dpo/loss_margin_mean": 33.92853546142578, "grad_norm": 46.59968948364258, "kl/avg_steps": 0.71875, "kl/beta": 0.03190013766288757, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.0349825555680045e-07, "logits/chosen": -3.5461673736572266, "logits/rejected": -3.5441713333129883, "logps/chosen": -78.61238861083984, "logps/ref_chosen": -50.29046630859375, "logps/ref_rejected": -68.4387435913086, "logps/rejected": -130.689208984375, "loss": 0.7848, "rewards/accuracies": 0.875, "rewards/chosen": -0.8984587788581848, "rewards/margins": 1.0700339078903198, "rewards/rejected": -1.9684927463531494, "step": 240 }, { "epoch": 0.36432350718065004, "epsilon_dpo/beta": 0.031487587839365005, "epsilon_dpo/beta_margin_grad_mean": -0.31947940587997437, "epsilon_dpo/beta_margin_grad_std": 0.1979939490556717, "epsilon_dpo/beta_margin_mean": 0.949558436870575, "epsilon_dpo/beta_margin_std": 1.0902316570281982, "epsilon_dpo/loss_margin_mean": 30.336381912231445, "grad_norm": 51.10459899902344, "kl/avg_steps": 0.59375, "kl/beta": 0.03167249262332916, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.0245247088227377e-07, "logits/chosen": -3.584836721420288, "logits/rejected": -3.647287368774414, "logps/chosen": -86.01239013671875, "logps/ref_chosen": -58.00928497314453, "logps/ref_rejected": -76.18869018554688, "logps/rejected": -134.52816772460938, "loss": 0.8773, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8842865824699402, "rewards/margins": 0.9495584964752197, "rewards/rejected": -1.8338450193405151, "step": 241 }, { "epoch": 0.36583522297808013, "epsilon_dpo/beta": 0.031282052397727966, "epsilon_dpo/beta_margin_grad_mean": -0.2993624210357666, "epsilon_dpo/beta_margin_grad_std": 0.19578564167022705, "epsilon_dpo/beta_margin_mean": 1.0992188453674316, "epsilon_dpo/beta_margin_std": 1.2100465297698975, "epsilon_dpo/loss_margin_mean": 35.31301498413086, "grad_norm": 44.37929153442383, "kl/avg_steps": 0.65625, "kl/beta": 0.03148554638028145, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.0140242178441665e-07, "logits/chosen": -3.5604074001312256, "logits/rejected": -3.619284152984619, "logps/chosen": -71.79724884033203, "logps/ref_chosen": -51.619964599609375, "logps/ref_rejected": -65.46922302246094, "logps/rejected": -120.95951843261719, "loss": 0.821, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6324730515480042, "rewards/margins": 1.0992188453674316, "rewards/rejected": -1.731691837310791, "step": 242 }, { "epoch": 0.3673469387755102, "epsilon_dpo/beta": 0.031156310811638832, "epsilon_dpo/beta_margin_grad_mean": -0.35772407054901123, "epsilon_dpo/beta_margin_grad_std": 0.1977265179157257, "epsilon_dpo/beta_margin_mean": 0.7389934659004211, "epsilon_dpo/beta_margin_std": 1.023369312286377, "epsilon_dpo/loss_margin_mean": 23.934062957763672, "grad_norm": 65.09718322753906, "kl/avg_steps": 0.40625, "kl/beta": 0.0312802717089653, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.003481376353596e-07, "logits/chosen": -3.6756114959716797, "logits/rejected": -3.6021385192871094, "logps/chosen": -92.63142395019531, "logps/ref_chosen": -65.2960205078125, "logps/ref_rejected": -66.09979248046875, "logps/rejected": -117.36924743652344, "loss": 0.9927, "rewards/accuracies": 0.75, "rewards/chosen": -0.8550564050674438, "rewards/margins": 0.7389934659004211, "rewards/rejected": -1.5940498113632202, "step": 243 }, { "epoch": 0.3688586545729403, "epsilon_dpo/beta": 0.03095235861837864, "epsilon_dpo/beta_margin_grad_mean": -0.2953191101551056, "epsilon_dpo/beta_margin_grad_std": 0.18051616847515106, "epsilon_dpo/beta_margin_mean": 1.0589251518249512, "epsilon_dpo/beta_margin_std": 1.0004513263702393, "epsilon_dpo/loss_margin_mean": 34.36445236206055, "grad_norm": 54.02566146850586, "kl/avg_steps": 0.65625, "kl/beta": 0.031153708696365356, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.9928964792569654e-07, "logits/chosen": -3.537680149078369, "logits/rejected": -3.495872974395752, "logps/chosen": -71.62876892089844, "logps/ref_chosen": -47.62297058105469, "logps/ref_rejected": -53.68562316894531, "logps/rejected": -112.05587768554688, "loss": 0.7796, "rewards/accuracies": 0.875, "rewards/chosen": -0.7461987733840942, "rewards/margins": 1.0589251518249512, "rewards/rejected": -1.8051238059997559, "step": 244 }, { "epoch": 0.37037037037037035, "epsilon_dpo/beta": 0.03072153776884079, "epsilon_dpo/beta_margin_grad_mean": -0.2961544990539551, "epsilon_dpo/beta_margin_grad_std": 0.18561357259750366, "epsilon_dpo/beta_margin_mean": 1.0497902631759644, "epsilon_dpo/beta_margin_std": 1.0091371536254883, "epsilon_dpo/loss_margin_mean": 34.30048751831055, "grad_norm": 50.116539001464844, "kl/avg_steps": 0.75, "kl/beta": 0.030950594693422318, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.982269822636601e-07, "logits/chosen": -3.6024911403656006, "logits/rejected": -3.6006321907043457, "logps/chosen": -82.85564422607422, "logps/ref_chosen": -58.18247985839844, "logps/ref_rejected": -70.70956420898438, "logps/rejected": -129.6832275390625, "loss": 0.7911, "rewards/accuracies": 0.875, "rewards/chosen": -0.7608175277709961, "rewards/margins": 1.0497902631759644, "rewards/rejected": -1.81060791015625, "step": 245 }, { "epoch": 0.37188208616780044, "epsilon_dpo/beta": 0.030588850378990173, "epsilon_dpo/beta_margin_grad_mean": -0.35125380754470825, "epsilon_dpo/beta_margin_grad_std": 0.22386348247528076, "epsilon_dpo/beta_margin_mean": 0.7832393646240234, "epsilon_dpo/beta_margin_std": 1.1743152141571045, "epsilon_dpo/loss_margin_mean": 25.865478515625, "grad_norm": 63.129005432128906, "kl/avg_steps": 0.4375, "kl/beta": 0.030720192939043045, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.971601703742932e-07, "logits/chosen": -3.637173652648926, "logits/rejected": -3.640636920928955, "logps/chosen": -98.38485717773438, "logps/ref_chosen": -66.71534729003906, "logps/ref_rejected": -86.57673645019531, "logps/rejected": -144.11172485351562, "loss": 1.0314, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9731709957122803, "rewards/margins": 0.7832393646240234, "rewards/rejected": -1.7564103603363037, "step": 246 }, { "epoch": 0.37339380196523053, "epsilon_dpo/beta": 0.030446050688624382, "epsilon_dpo/beta_margin_grad_mean": -0.3735252618789673, "epsilon_dpo/beta_margin_grad_std": 0.20576192438602448, "epsilon_dpo/beta_margin_mean": 0.6456315517425537, "epsilon_dpo/beta_margin_std": 1.053346037864685, "epsilon_dpo/loss_margin_mean": 21.423704147338867, "grad_norm": 58.22221755981445, "kl/avg_steps": 0.46875, "kl/beta": 0.030586378648877144, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.960892420986177e-07, "logits/chosen": -3.68768310546875, "logits/rejected": -3.6840639114379883, "logps/chosen": -98.93038940429688, "logps/ref_chosen": -72.88449096679688, "logps/ref_rejected": -87.60906982421875, "logps/rejected": -135.07867431640625, "loss": 1.0736, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7954668998718262, "rewards/margins": 0.6456315517425537, "rewards/rejected": -1.4410984516143799, "step": 247 }, { "epoch": 0.3749055177626606, "epsilon_dpo/beta": 0.03028496913611889, "epsilon_dpo/beta_margin_grad_mean": -0.33445361256599426, "epsilon_dpo/beta_margin_grad_std": 0.2087889015674591, "epsilon_dpo/beta_margin_mean": 0.8773745894432068, "epsilon_dpo/beta_margin_std": 1.1472259759902954, "epsilon_dpo/loss_margin_mean": 29.191547393798828, "grad_norm": 55.27407455444336, "kl/avg_steps": 0.53125, "kl/beta": 0.030443673953413963, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -3.6465039253234863, "logits/rejected": -3.545814037322998, "logps/chosen": -89.03305053710938, "logps/ref_chosen": -65.4447021484375, "logps/ref_rejected": -56.367881774902344, "logps/rejected": -109.14777374267578, "loss": 0.9483, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7187416553497314, "rewards/margins": 0.8773746490478516, "rewards/rejected": -1.596116304397583, "step": 248 }, { "epoch": 0.3764172335600907, "epsilon_dpo/beta": 0.030191179364919662, "epsilon_dpo/beta_margin_grad_mean": -0.42417946457862854, "epsilon_dpo/beta_margin_grad_std": 0.2049740105867386, "epsilon_dpo/beta_margin_mean": 0.37237900495529175, "epsilon_dpo/beta_margin_std": 1.0296086072921753, "epsilon_dpo/loss_margin_mean": 12.566636085510254, "grad_norm": 65.10012817382812, "kl/avg_steps": 0.3125, "kl/beta": 0.030282795429229736, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.9393515632731094e-07, "logits/chosen": -3.6460013389587402, "logits/rejected": -3.5336740016937256, "logps/chosen": -101.76521301269531, "logps/ref_chosen": -70.15800476074219, "logps/ref_rejected": -56.59617614746094, "logps/rejected": -100.77001953125, "loss": 1.2786, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9584237337112427, "rewards/margins": 0.37237900495529175, "rewards/rejected": -1.3308026790618896, "step": 249 }, { "epoch": 0.3779289493575208, "epsilon_dpo/beta": 0.02996504120528698, "epsilon_dpo/beta_margin_grad_mean": -0.284501314163208, "epsilon_dpo/beta_margin_grad_std": 0.18751470744609833, "epsilon_dpo/beta_margin_mean": 1.0861340761184692, "epsilon_dpo/beta_margin_std": 1.0152329206466675, "epsilon_dpo/loss_margin_mean": 36.3934211730957, "grad_norm": 50.31420135498047, "kl/avg_steps": 0.75, "kl/beta": 0.030188458040356636, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.9285205908608934e-07, "logits/chosen": -3.577256202697754, "logits/rejected": -3.527822494506836, "logps/chosen": -76.02413177490234, "logps/ref_chosen": -54.87158966064453, "logps/ref_rejected": -61.89708709716797, "logps/rejected": -119.44305419921875, "loss": 0.7794, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6355237364768982, "rewards/margins": 1.0861341953277588, "rewards/rejected": -1.7216578722000122, "step": 250 }, { "epoch": 0.3794406651549509, "epsilon_dpo/beta": 0.02985435165464878, "epsilon_dpo/beta_margin_grad_mean": -0.4094768762588501, "epsilon_dpo/beta_margin_grad_std": 0.19858568906784058, "epsilon_dpo/beta_margin_mean": 0.4522101581096649, "epsilon_dpo/beta_margin_std": 0.9909750819206238, "epsilon_dpo/loss_margin_mean": 15.362441062927246, "grad_norm": 61.0008544921875, "kl/avg_steps": 0.375, "kl/beta": 0.02996372990310192, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.9176496596569265e-07, "logits/chosen": -3.6427130699157715, "logits/rejected": -3.6399378776550293, "logps/chosen": -87.44236755371094, "logps/ref_chosen": -60.74372863769531, "logps/ref_rejected": -69.62147521972656, "logps/rejected": -111.68255615234375, "loss": 1.1965, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7996013760566711, "rewards/margins": 0.45221012830734253, "rewards/rejected": -1.2518115043640137, "step": 251 }, { "epoch": 0.38095238095238093, "epsilon_dpo/beta": 0.029733484610915184, "epsilon_dpo/beta_margin_grad_mean": -0.37406808137893677, "epsilon_dpo/beta_margin_grad_std": 0.20288723707199097, "epsilon_dpo/beta_margin_mean": 0.6282574534416199, "epsilon_dpo/beta_margin_std": 1.1099085807800293, "epsilon_dpo/loss_margin_mean": 21.3692569732666, "grad_norm": 52.91516876220703, "kl/avg_steps": 0.40625, "kl/beta": 0.02985178492963314, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.9067390737445254e-07, "logits/chosen": -3.580700397491455, "logits/rejected": -3.557034730911255, "logps/chosen": -69.9707260131836, "logps/ref_chosen": -48.75269317626953, "logps/ref_rejected": -69.82833862304688, "logps/rejected": -112.41563415527344, "loss": 1.1013, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6327940225601196, "rewards/margins": 0.6282573938369751, "rewards/rejected": -1.2610514163970947, "step": 252 }, { "epoch": 0.382464096749811, "epsilon_dpo/beta": 0.02960389107465744, "epsilon_dpo/beta_margin_grad_mean": -0.3725152909755707, "epsilon_dpo/beta_margin_grad_std": 0.20365676283836365, "epsilon_dpo/beta_margin_mean": 0.6370367407798767, "epsilon_dpo/beta_margin_std": 1.0394784212112427, "epsilon_dpo/loss_margin_mean": 21.740999221801758, "grad_norm": 44.64933395385742, "kl/avg_steps": 0.4375, "kl/beta": 0.02973100356757641, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.8957891383162304e-07, "logits/chosen": -3.490905284881592, "logits/rejected": -3.5019659996032715, "logps/chosen": -64.89385986328125, "logps/ref_chosen": -46.66432189941406, "logps/ref_rejected": -51.97372817993164, "logps/rejected": -91.94425964355469, "loss": 1.0754, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5417308807373047, "rewards/margins": 0.6370368003845215, "rewards/rejected": -1.1787676811218262, "step": 253 }, { "epoch": 0.3839758125472411, "epsilon_dpo/beta": 0.029493439942598343, "epsilon_dpo/beta_margin_grad_mean": -0.3818342983722687, "epsilon_dpo/beta_margin_grad_std": 0.2021295726299286, "epsilon_dpo/beta_margin_mean": 0.608883798122406, "epsilon_dpo/beta_margin_std": 1.035619854927063, "epsilon_dpo/loss_margin_mean": 20.876787185668945, "grad_norm": 58.75230026245117, "kl/avg_steps": 0.375, "kl/beta": 0.02960149571299553, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.884800159665276e-07, "logits/chosen": -3.546489953994751, "logits/rejected": -3.6087217330932617, "logps/chosen": -84.4720458984375, "logps/ref_chosen": -58.050567626953125, "logps/ref_rejected": -75.31587219238281, "logps/rejected": -122.61415100097656, "loss": 1.0908, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7838003635406494, "rewards/margins": 0.608883798122406, "rewards/rejected": -1.3926842212677002, "step": 254 }, { "epoch": 0.3854875283446712, "epsilon_dpo/beta": 0.02930951863527298, "epsilon_dpo/beta_margin_grad_mean": -0.33775338530540466, "epsilon_dpo/beta_margin_grad_std": 0.20432068407535553, "epsilon_dpo/beta_margin_mean": 0.835517942905426, "epsilon_dpo/beta_margin_std": 1.110921025276184, "epsilon_dpo/loss_margin_mean": 28.69426727294922, "grad_norm": 49.94908142089844, "kl/avg_steps": 0.625, "kl/beta": 0.02949090488255024, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.873772445177015e-07, "logits/chosen": -3.6488828659057617, "logits/rejected": -3.6104092597961426, "logps/chosen": -87.12797546386719, "logps/ref_chosen": -67.61114501953125, "logps/ref_rejected": -77.63667297363281, "logps/rejected": -125.8477783203125, "loss": 0.9618, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5737706422805786, "rewards/margins": 0.8355178833007812, "rewards/rejected": -1.4092886447906494, "step": 255 }, { "epoch": 0.3869992441421013, "epsilon_dpo/beta": 0.029154948890209198, "epsilon_dpo/beta_margin_grad_mean": -0.33983492851257324, "epsilon_dpo/beta_margin_grad_std": 0.18856678903102875, "epsilon_dpo/beta_margin_mean": 0.8219020366668701, "epsilon_dpo/beta_margin_std": 0.9891291260719299, "epsilon_dpo/loss_margin_mean": 28.378610610961914, "grad_norm": 48.428062438964844, "kl/avg_steps": 0.53125, "kl/beta": 0.029307732358574867, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.862706303320329e-07, "logits/chosen": -3.5416808128356934, "logits/rejected": -3.537715435028076, "logps/chosen": -72.05642700195312, "logps/ref_chosen": -51.523719787597656, "logps/ref_rejected": -83.32015991210938, "logps/rejected": -132.23147583007812, "loss": 0.9217, "rewards/accuracies": 0.75, "rewards/chosen": -0.6007453203201294, "rewards/margins": 0.8219020366668701, "rewards/rejected": -1.4226473569869995, "step": 256 }, { "epoch": 0.3885109599395314, "epsilon_dpo/beta": 0.02893710322678089, "epsilon_dpo/beta_margin_grad_mean": -0.325363427400589, "epsilon_dpo/beta_margin_grad_std": 0.20933221280574799, "epsilon_dpo/beta_margin_mean": 0.9128665924072266, "epsilon_dpo/beta_margin_std": 1.2138084173202515, "epsilon_dpo/loss_margin_mean": 31.714256286621094, "grad_norm": 53.623252868652344, "kl/avg_steps": 0.75, "kl/beta": 0.02915285713970661, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.851602043638994e-07, "logits/chosen": -3.6359810829162598, "logits/rejected": -3.631044387817383, "logps/chosen": -84.15858459472656, "logps/ref_chosen": -60.662689208984375, "logps/ref_rejected": -97.52305603027344, "logps/rejected": -152.73321533203125, "loss": 0.9476, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6823408603668213, "rewards/margins": 0.9128665328025818, "rewards/rejected": -1.5952074527740479, "step": 257 }, { "epoch": 0.3900226757369615, "epsilon_dpo/beta": 0.02876690775156021, "epsilon_dpo/beta_margin_grad_mean": -0.3407791256904602, "epsilon_dpo/beta_margin_grad_std": 0.16844306886196136, "epsilon_dpo/beta_margin_mean": 0.7861775755882263, "epsilon_dpo/beta_margin_std": 0.8930336236953735, "epsilon_dpo/loss_margin_mean": 27.47364616394043, "grad_norm": 47.24456787109375, "kl/avg_steps": 0.59375, "kl/beta": 0.028935838490724564, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.840459976743023e-07, "logits/chosen": -3.5434560775756836, "logits/rejected": -3.6504127979278564, "logps/chosen": -73.260986328125, "logps/ref_chosen": -50.68846893310547, "logps/ref_rejected": -79.4976577758789, "logps/rejected": -129.5438232421875, "loss": 0.9087, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6507036685943604, "rewards/margins": 0.7861775159835815, "rewards/rejected": -1.4368813037872314, "step": 258 }, { "epoch": 0.3915343915343915, "epsilon_dpo/beta": 0.028588123619556427, "epsilon_dpo/beta_margin_grad_mean": -0.3228612244129181, "epsilon_dpo/beta_margin_grad_std": 0.1956639140844345, "epsilon_dpo/beta_margin_mean": 0.9073410034179688, "epsilon_dpo/beta_margin_std": 1.0478018522262573, "epsilon_dpo/loss_margin_mean": 31.930784225463867, "grad_norm": 49.15840148925781, "kl/avg_steps": 0.625, "kl/beta": 0.02876504696905613, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.8292804142999796e-07, "logits/chosen": -3.5949811935424805, "logits/rejected": -3.6072301864624023, "logps/chosen": -75.76548767089844, "logps/ref_chosen": -59.32402038574219, "logps/ref_rejected": -83.28133392333984, "logps/rejected": -131.65359497070312, "loss": 0.8915, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4726943075656891, "rewards/margins": 0.9073410034179688, "rewards/rejected": -1.3800352811813354, "step": 259 }, { "epoch": 0.3930461073318216, "epsilon_dpo/beta": 0.028437361121177673, "epsilon_dpo/beta_margin_grad_mean": -0.3530673086643219, "epsilon_dpo/beta_margin_grad_std": 0.1954687088727951, "epsilon_dpo/beta_margin_mean": 0.7355630993843079, "epsilon_dpo/beta_margin_std": 1.008644461631775, "epsilon_dpo/loss_margin_mean": 26.08067512512207, "grad_norm": 47.68777084350586, "kl/avg_steps": 0.53125, "kl/beta": 0.028586382046341896, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.818063669026256e-07, "logits/chosen": -3.5422921180725098, "logits/rejected": -3.5187509059906006, "logps/chosen": -62.47365951538086, "logps/ref_chosen": -44.53438949584961, "logps/ref_rejected": -69.82275390625, "logps/rejected": -113.84269714355469, "loss": 0.9904, "rewards/accuracies": 0.75, "rewards/chosen": -0.5131896734237671, "rewards/margins": 0.7355630993843079, "rewards/rejected": -1.2487528324127197, "step": 260 }, { "epoch": 0.3945578231292517, "epsilon_dpo/beta": 0.028260424733161926, "epsilon_dpo/beta_margin_grad_mean": -0.327075332403183, "epsilon_dpo/beta_margin_grad_std": 0.15859192609786987, "epsilon_dpo/beta_margin_mean": 0.832417905330658, "epsilon_dpo/beta_margin_std": 0.8205698132514954, "epsilon_dpo/loss_margin_mean": 29.594884872436523, "grad_norm": 44.85503005981445, "kl/avg_steps": 0.625, "kl/beta": 0.028435319662094116, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.806810054678331e-07, "logits/chosen": -3.5502078533172607, "logits/rejected": -3.4599995613098145, "logps/chosen": -78.87286376953125, "logps/ref_chosen": -57.50079345703125, "logps/ref_rejected": -58.719940185546875, "logps/rejected": -109.6868896484375, "loss": 0.8569, "rewards/accuracies": 0.875, "rewards/chosen": -0.6041717529296875, "rewards/margins": 0.832417905330658, "rewards/rejected": -1.4365897178649902, "step": 261 }, { "epoch": 0.3960695389266818, "epsilon_dpo/beta": 0.02805839665234089, "epsilon_dpo/beta_margin_grad_mean": -0.33737432956695557, "epsilon_dpo/beta_margin_grad_std": 0.1681991070508957, "epsilon_dpo/beta_margin_mean": 0.777191162109375, "epsilon_dpo/beta_margin_std": 0.8419005274772644, "epsilon_dpo/loss_margin_mean": 27.834020614624023, "grad_norm": 50.670440673828125, "kl/avg_steps": 0.71875, "kl/beta": 0.028258701786398888, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.7955198860439887e-07, "logits/chosen": -3.610352039337158, "logits/rejected": -3.5990848541259766, "logps/chosen": -79.70450592041016, "logps/ref_chosen": -58.254920959472656, "logps/ref_rejected": -76.08486938476562, "logps/rejected": -125.36847686767578, "loss": 0.9033, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6038655042648315, "rewards/margins": 0.777191162109375, "rewards/rejected": -1.381056785583496, "step": 262 }, { "epoch": 0.3975812547241119, "epsilon_dpo/beta": 0.027919549494981766, "epsilon_dpo/beta_margin_grad_mean": -0.3778422772884369, "epsilon_dpo/beta_margin_grad_std": 0.19608712196350098, "epsilon_dpo/beta_margin_mean": 0.5965914726257324, "epsilon_dpo/beta_margin_std": 0.959089994430542, "epsilon_dpo/loss_margin_mean": 21.586875915527344, "grad_norm": 49.47671890258789, "kl/avg_steps": 0.5, "kl/beta": 0.028057042509317398, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.784193478933516e-07, "logits/chosen": -3.530191421508789, "logits/rejected": -3.6355128288269043, "logps/chosen": -77.72067260742188, "logps/ref_chosen": -52.77592468261719, "logps/ref_rejected": -76.45590209960938, "logps/rejected": -122.9875259399414, "loss": 1.0757, "rewards/accuracies": 0.765625, "rewards/chosen": -0.69884192943573, "rewards/margins": 0.5965914130210876, "rewards/rejected": -1.2954332828521729, "step": 263 }, { "epoch": 0.39909297052154197, "epsilon_dpo/beta": 0.02780681848526001, "epsilon_dpo/beta_margin_grad_mean": -0.348049134016037, "epsilon_dpo/beta_margin_grad_std": 0.18626873195171356, "epsilon_dpo/beta_margin_mean": 0.761427640914917, "epsilon_dpo/beta_margin_std": 0.9667872786521912, "epsilon_dpo/loss_margin_mean": 27.59726333618164, "grad_norm": 51.054840087890625, "kl/avg_steps": 0.40625, "kl/beta": 0.02791745401918888, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.7728311501708674e-07, "logits/chosen": -3.608062744140625, "logits/rejected": -3.5790553092956543, "logps/chosen": -98.29541015625, "logps/ref_chosen": -69.18888092041016, "logps/ref_rejected": -72.205078125, "logps/rejected": -128.90887451171875, "loss": 0.9556, "rewards/accuracies": 0.703125, "rewards/chosen": -0.811768114566803, "rewards/margins": 0.7614275813102722, "rewards/rejected": -1.5731956958770752, "step": 264 }, { "epoch": 0.40060468631897206, "epsilon_dpo/beta": 0.027668243274092674, "epsilon_dpo/beta_margin_grad_mean": -0.3470227122306824, "epsilon_dpo/beta_margin_grad_std": 0.2068847268819809, "epsilon_dpo/beta_margin_mean": 0.7655115723609924, "epsilon_dpo/beta_margin_std": 1.0473060607910156, "epsilon_dpo/loss_margin_mean": 27.915241241455078, "grad_norm": 53.42154312133789, "kl/avg_steps": 0.5, "kl/beta": 0.027804499492049217, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.7614332175848027e-07, "logits/chosen": -3.4719674587249756, "logits/rejected": -3.466555118560791, "logps/chosen": -70.08794403076172, "logps/ref_chosen": -43.96622085571289, "logps/ref_rejected": -59.57981872558594, "logps/rejected": -113.61679077148438, "loss": 0.9916, "rewards/accuracies": 0.75, "rewards/chosen": -0.7268335819244385, "rewards/margins": 0.7655115127563477, "rewards/rejected": -1.4923452138900757, "step": 265 }, { "epoch": 0.4021164021164021, "epsilon_dpo/beta": 0.027461417019367218, "epsilon_dpo/beta_margin_grad_mean": -0.3286169171333313, "epsilon_dpo/beta_margin_grad_std": 0.16591742634773254, "epsilon_dpo/beta_margin_mean": 0.8412624001502991, "epsilon_dpo/beta_margin_std": 0.8860583305358887, "epsilon_dpo/loss_margin_mean": 30.753999710083008, "grad_norm": 47.37533187866211, "kl/avg_steps": 0.75, "kl/beta": 0.02766616828739643, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.75e-07, "logits/chosen": -3.495457410812378, "logits/rejected": -3.5999817848205566, "logps/chosen": -70.7826156616211, "logps/ref_chosen": -49.006813049316406, "logps/ref_rejected": -71.20953369140625, "logps/rejected": -123.73934173583984, "loss": 0.8697, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5995597243309021, "rewards/margins": 0.8412623405456543, "rewards/rejected": -1.4408221244812012, "step": 266 }, { "epoch": 0.4036281179138322, "epsilon_dpo/beta": 0.02730848267674446, "epsilon_dpo/beta_margin_grad_mean": -0.3450157940387726, "epsilon_dpo/beta_margin_grad_std": 0.18275727331638336, "epsilon_dpo/beta_margin_mean": 0.7722615003585815, "epsilon_dpo/beta_margin_std": 0.9526923894882202, "epsilon_dpo/loss_margin_mean": 28.463619232177734, "grad_norm": 48.18452453613281, "kl/avg_steps": 0.5625, "kl/beta": 0.027460215613245964, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.738531817228131e-07, "logits/chosen": -3.417224884033203, "logits/rejected": -3.4844722747802734, "logps/chosen": -66.78785705566406, "logps/ref_chosen": -45.540836334228516, "logps/ref_rejected": -61.607276916503906, "logps/rejected": -111.31791687011719, "loss": 0.9414, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5823400020599365, "rewards/margins": 0.7722614407539368, "rewards/rejected": -1.354601502418518, "step": 267 }, { "epoch": 0.4051398337112623, "epsilon_dpo/beta": 0.027181332930922508, "epsilon_dpo/beta_margin_grad_mean": -0.3615582585334778, "epsilon_dpo/beta_margin_grad_std": 0.17675510048866272, "epsilon_dpo/beta_margin_mean": 0.6903026700019836, "epsilon_dpo/beta_margin_std": 0.9339083433151245, "epsilon_dpo/loss_margin_mean": 25.58917808532715, "grad_norm": 46.5500602722168, "kl/avg_steps": 0.46875, "kl/beta": 0.02730661630630493, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.7270289900589204e-07, "logits/chosen": -3.5379815101623535, "logits/rejected": -3.559307813644409, "logps/chosen": -76.1719970703125, "logps/ref_chosen": -51.97987365722656, "logps/ref_rejected": -62.867828369140625, "logps/rejected": -112.64912414550781, "loss": 0.9857, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6590510010719299, "rewards/margins": 0.6903026103973389, "rewards/rejected": -1.349353551864624, "step": 268 }, { "epoch": 0.40665154950869237, "epsilon_dpo/beta": 0.027067216113209724, "epsilon_dpo/beta_margin_grad_mean": -0.3807787597179413, "epsilon_dpo/beta_margin_grad_std": 0.17998461425304413, "epsilon_dpo/beta_margin_mean": 0.5820295214653015, "epsilon_dpo/beta_margin_std": 0.8969993591308594, "epsilon_dpo/loss_margin_mean": 21.712522506713867, "grad_norm": 49.719390869140625, "kl/avg_steps": 0.421875, "kl/beta": 0.027179215103387833, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.7154918402511714e-07, "logits/chosen": -3.601137638092041, "logits/rejected": -3.546011209487915, "logps/chosen": -92.56260681152344, "logps/ref_chosen": -60.658966064453125, "logps/ref_rejected": -78.83199310302734, "logps/rejected": -132.44815063476562, "loss": 1.0586, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8667058348655701, "rewards/margins": 0.5820295214653015, "rewards/rejected": -1.4487353563308716, "step": 269 }, { "epoch": 0.40816326530612246, "epsilon_dpo/beta": 0.026907024905085564, "epsilon_dpo/beta_margin_grad_mean": -0.36636677384376526, "epsilon_dpo/beta_margin_grad_std": 0.17549282312393188, "epsilon_dpo/beta_margin_mean": 0.6481313705444336, "epsilon_dpo/beta_margin_std": 0.8995020985603333, "epsilon_dpo/loss_margin_mean": 24.250646591186523, "grad_norm": 54.60033416748047, "kl/avg_steps": 0.59375, "kl/beta": 0.027065033093094826, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.7039206905237656e-07, "logits/chosen": -3.496518135070801, "logits/rejected": -3.591644763946533, "logps/chosen": -83.84132385253906, "logps/ref_chosen": -57.96975326538086, "logps/ref_rejected": -73.79118347167969, "logps/rejected": -123.91339111328125, "loss": 1.0087, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6983015537261963, "rewards/margins": 0.6481313705444336, "rewards/rejected": -1.3464329242706299, "step": 270 }, { "epoch": 0.40967498110355255, "epsilon_dpo/beta": 0.02681547775864601, "epsilon_dpo/beta_margin_grad_mean": -0.38882285356521606, "epsilon_dpo/beta_margin_grad_std": 0.21483492851257324, "epsilon_dpo/beta_margin_mean": 0.6285286545753479, "epsilon_dpo/beta_margin_std": 1.1996771097183228, "epsilon_dpo/loss_margin_mean": 23.72130584716797, "grad_norm": 51.445621490478516, "kl/avg_steps": 0.34375, "kl/beta": 0.026905283331871033, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.692315864546635e-07, "logits/chosen": -3.55088472366333, "logits/rejected": -3.6585793495178223, "logps/chosen": -84.46916961669922, "logps/ref_chosen": -60.323020935058594, "logps/ref_rejected": -87.9923095703125, "logps/rejected": -135.85975646972656, "loss": 1.1329, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6505815386772156, "rewards/margins": 0.6285287141799927, "rewards/rejected": -1.2791101932525635, "step": 271 }, { "epoch": 0.41118669690098264, "epsilon_dpo/beta": 0.026614677160978317, "epsilon_dpo/beta_margin_grad_mean": -0.30074161291122437, "epsilon_dpo/beta_margin_grad_std": 0.15430134534835815, "epsilon_dpo/beta_margin_mean": 0.9711037874221802, "epsilon_dpo/beta_margin_std": 0.809634268283844, "epsilon_dpo/loss_margin_mean": 36.59514617919922, "grad_norm": 38.902626037597656, "kl/avg_steps": 0.75, "kl/beta": 0.026813114061951637, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.6806776869317067e-07, "logits/chosen": -3.6169586181640625, "logits/rejected": -3.551443099975586, "logps/chosen": -76.90818786621094, "logps/ref_chosen": -57.508968353271484, "logps/ref_rejected": -66.32839965820312, "logps/rejected": -122.32276153564453, "loss": 0.7689, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5169238448143005, "rewards/margins": 0.9711037874221802, "rewards/rejected": -1.488027572631836, "step": 272 }, { "epoch": 0.4126984126984127, "epsilon_dpo/beta": 0.0264331866055727, "epsilon_dpo/beta_margin_grad_mean": -0.32822009921073914, "epsilon_dpo/beta_margin_grad_std": 0.18097271025180817, "epsilon_dpo/beta_margin_mean": 0.8469125032424927, "epsilon_dpo/beta_margin_std": 0.9704737663269043, "epsilon_dpo/loss_margin_mean": 32.213897705078125, "grad_norm": 46.045562744140625, "kl/avg_steps": 0.6875, "kl/beta": 0.0266135111451149, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.669006483223828e-07, "logits/chosen": -3.5004308223724365, "logits/rejected": -3.5750699043273926, "logps/chosen": -76.95924377441406, "logps/ref_chosen": -52.600013732910156, "logps/ref_rejected": -77.47993469238281, "logps/rejected": -134.0530548095703, "loss": 0.8978, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6464325785636902, "rewards/margins": 0.8469125032424927, "rewards/rejected": -1.493345022201538, "step": 273 }, { "epoch": 0.41421012849584277, "epsilon_dpo/beta": 0.02626095898449421, "epsilon_dpo/beta_margin_grad_mean": -0.327284574508667, "epsilon_dpo/beta_margin_grad_std": 0.18543575704097748, "epsilon_dpo/beta_margin_mean": 0.8514215350151062, "epsilon_dpo/beta_margin_std": 0.9426968693733215, "epsilon_dpo/loss_margin_mean": 32.6125602722168, "grad_norm": 45.8482666015625, "kl/avg_steps": 0.65625, "kl/beta": 0.026431793347001076, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.657302579891656e-07, "logits/chosen": -3.4862723350524902, "logits/rejected": -3.451174736022949, "logps/chosen": -76.99978637695312, "logps/ref_chosen": -50.854888916015625, "logps/ref_rejected": -60.379032135009766, "logps/rejected": -119.13648986816406, "loss": 0.8907, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6887195110321045, "rewards/margins": 0.8514215350151062, "rewards/rejected": -1.5401411056518555, "step": 274 }, { "epoch": 0.41572184429327286, "epsilon_dpo/beta": 0.02609795331954956, "epsilon_dpo/beta_margin_grad_mean": -0.3101238012313843, "epsilon_dpo/beta_margin_grad_std": 0.16840222477912903, "epsilon_dpo/beta_margin_mean": 0.9131906628608704, "epsilon_dpo/beta_margin_std": 0.8416401147842407, "epsilon_dpo/loss_margin_mean": 35.1732177734375, "grad_norm": 49.62858200073242, "kl/avg_steps": 0.625, "kl/beta": 0.026259465143084526, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.645566304318526e-07, "logits/chosen": -3.4851372241973877, "logits/rejected": -3.459151029586792, "logps/chosen": -74.43925476074219, "logps/ref_chosen": -48.05162811279297, "logps/ref_rejected": -63.60442352294922, "logps/rejected": -125.16526794433594, "loss": 0.8191, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6903046369552612, "rewards/margins": 0.9131906032562256, "rewards/rejected": -1.6034951210021973, "step": 275 }, { "epoch": 0.41723356009070295, "epsilon_dpo/beta": 0.025935852900147438, "epsilon_dpo/beta_margin_grad_mean": -0.3383890688419342, "epsilon_dpo/beta_margin_grad_std": 0.16998395323753357, "epsilon_dpo/beta_margin_mean": 0.7773675918579102, "epsilon_dpo/beta_margin_std": 0.8601058125495911, "epsilon_dpo/loss_margin_mean": 30.13843536376953, "grad_norm": 43.81614685058594, "kl/avg_steps": 0.625, "kl/beta": 0.026096362620592117, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.633797984793294e-07, "logits/chosen": -3.4434266090393066, "logits/rejected": -3.4127590656280518, "logps/chosen": -66.47349548339844, "logps/ref_chosen": -43.999977111816406, "logps/ref_rejected": -59.52679443359375, "logps/rejected": -112.13874816894531, "loss": 0.9087, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5843386054039001, "rewards/margins": 0.7773675918579102, "rewards/rejected": -1.3617061376571655, "step": 276 }, { "epoch": 0.41874527588813304, "epsilon_dpo/beta": 0.025863919407129288, "epsilon_dpo/beta_margin_grad_mean": -0.4072629511356354, "epsilon_dpo/beta_margin_grad_std": 0.19686594605445862, "epsilon_dpo/beta_margin_mean": 0.4675597548484802, "epsilon_dpo/beta_margin_std": 0.991973876953125, "epsilon_dpo/loss_margin_mean": 18.34613037109375, "grad_norm": 58.08272933959961, "kl/avg_steps": 0.28125, "kl/beta": 0.02593427337706089, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.6219979505011555e-07, "logits/chosen": -3.5127882957458496, "logits/rejected": -3.4736993312835693, "logps/chosen": -89.53678131103516, "logps/ref_chosen": -60.1158447265625, "logps/ref_rejected": -64.94580841064453, "logps/rejected": -112.71287536621094, "loss": 1.1836, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7645653486251831, "rewards/margins": 0.4675597548484802, "rewards/rejected": -1.2321250438690186, "step": 277 }, { "epoch": 0.42025699168556313, "epsilon_dpo/beta": 0.025710556656122208, "epsilon_dpo/beta_margin_grad_mean": -0.34068748354911804, "epsilon_dpo/beta_margin_grad_std": 0.1879613846540451, "epsilon_dpo/beta_margin_mean": 0.7573944926261902, "epsilon_dpo/beta_margin_std": 0.9451728463172913, "epsilon_dpo/loss_margin_mean": 29.676828384399414, "grad_norm": 53.42407989501953, "kl/avg_steps": 0.59375, "kl/beta": 0.02586153894662857, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.6101665315144353e-07, "logits/chosen": -3.516630172729492, "logits/rejected": -3.613682270050049, "logps/chosen": -89.52540588378906, "logps/ref_chosen": -59.255680084228516, "logps/ref_rejected": -86.18937683105469, "logps/rejected": -146.1359405517578, "loss": 0.9571, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7802098989486694, "rewards/margins": 0.757394552230835, "rewards/rejected": -1.5376043319702148, "step": 278 }, { "epoch": 0.4217687074829932, "epsilon_dpo/beta": 0.02551862597465515, "epsilon_dpo/beta_margin_grad_mean": -0.3212796449661255, "epsilon_dpo/beta_margin_grad_std": 0.16688181459903717, "epsilon_dpo/beta_margin_mean": 0.862905740737915, "epsilon_dpo/beta_margin_std": 0.8581858277320862, "epsilon_dpo/loss_margin_mean": 33.94643020629883, "grad_norm": 52.292667388916016, "kl/avg_steps": 0.75, "kl/beta": 0.025708891451358795, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.5983040587833563e-07, "logits/chosen": -3.571943521499634, "logits/rejected": -3.5322561264038086, "logps/chosen": -71.05947875976562, "logps/ref_chosen": -49.807952880859375, "logps/ref_rejected": -66.16442108154297, "logps/rejected": -121.36237335205078, "loss": 0.8524, "rewards/accuracies": 0.875, "rewards/chosen": -0.5434627532958984, "rewards/margins": 0.862905740737915, "rewards/rejected": -1.4063684940338135, "step": 279 }, { "epoch": 0.42328042328042326, "epsilon_dpo/beta": 0.02533663623034954, "epsilon_dpo/beta_margin_grad_mean": -0.32832470536231995, "epsilon_dpo/beta_margin_grad_std": 0.16541075706481934, "epsilon_dpo/beta_margin_mean": 0.8273611664772034, "epsilon_dpo/beta_margin_std": 0.852222740650177, "epsilon_dpo/loss_margin_mean": 32.796417236328125, "grad_norm": 42.27378845214844, "kl/avg_steps": 0.71875, "kl/beta": 0.025517510250210762, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.586410864126781e-07, "logits/chosen": -3.5194931030273438, "logits/rejected": -3.5373425483703613, "logps/chosen": -80.07586669921875, "logps/ref_chosen": -54.87303161621094, "logps/ref_rejected": -73.10842895507812, "logps/rejected": -131.10769653320312, "loss": 0.8717, "rewards/accuracies": 0.875, "rewards/chosen": -0.639750599861145, "rewards/margins": 0.8273611664772034, "rewards/rejected": -1.4671118259429932, "step": 280 }, { "epoch": 0.42479213907785335, "epsilon_dpo/beta": 0.025179583579301834, "epsilon_dpo/beta_margin_grad_mean": -0.3320300579071045, "epsilon_dpo/beta_margin_grad_std": 0.16534727811813354, "epsilon_dpo/beta_margin_mean": 0.7969549298286438, "epsilon_dpo/beta_margin_std": 0.818410336971283, "epsilon_dpo/loss_margin_mean": 31.821311950683594, "grad_norm": 42.852996826171875, "kl/avg_steps": 0.625, "kl/beta": 0.025335412472486496, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.574487280222929e-07, "logits/chosen": -3.5348010063171387, "logits/rejected": -3.432969093322754, "logps/chosen": -76.64735412597656, "logps/ref_chosen": -54.82600402832031, "logps/ref_rejected": -58.821510314941406, "logps/rejected": -112.46417236328125, "loss": 0.8838, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5524246692657471, "rewards/margins": 0.7969549894332886, "rewards/rejected": -1.3493796586990356, "step": 281 }, { "epoch": 0.42630385487528344, "epsilon_dpo/beta": 0.025015318766236305, "epsilon_dpo/beta_margin_grad_mean": -0.32832956314086914, "epsilon_dpo/beta_margin_grad_std": 0.17006830871105194, "epsilon_dpo/beta_margin_mean": 0.8474223017692566, "epsilon_dpo/beta_margin_std": 0.9065929651260376, "epsilon_dpo/loss_margin_mean": 34.050113677978516, "grad_norm": 51.078182220458984, "kl/avg_steps": 0.65625, "kl/beta": 0.025178048759698868, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.562533640600075e-07, "logits/chosen": -3.5032458305358887, "logits/rejected": -3.5329298973083496, "logps/chosen": -80.05890655517578, "logps/ref_chosen": -52.77884292602539, "logps/ref_rejected": -61.18558120727539, "logps/rejected": -122.51575469970703, "loss": 0.8735, "rewards/accuracies": 0.859375, "rewards/chosen": -0.683773934841156, "rewards/margins": 0.8474222421646118, "rewards/rejected": -1.531196117401123, "step": 282 }, { "epoch": 0.42781557067271353, "epsilon_dpo/beta": 0.024867862462997437, "epsilon_dpo/beta_margin_grad_mean": -0.346819132566452, "epsilon_dpo/beta_margin_grad_std": 0.17922072112560272, "epsilon_dpo/beta_margin_mean": 0.7386068105697632, "epsilon_dpo/beta_margin_std": 0.8724326491355896, "epsilon_dpo/loss_margin_mean": 29.8946475982666, "grad_norm": 50.959434509277344, "kl/avg_steps": 0.59375, "kl/beta": 0.025013895705342293, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.550550279627215e-07, "logits/chosen": -3.472764253616333, "logits/rejected": -3.6371231079101562, "logps/chosen": -86.67143249511719, "logps/ref_chosen": -55.930294036865234, "logps/ref_rejected": -87.93072509765625, "logps/rejected": -148.56649780273438, "loss": 0.9423, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7671502828598022, "rewards/margins": 0.7386068105697632, "rewards/rejected": -1.5057570934295654, "step": 283 }, { "epoch": 0.4293272864701436, "epsilon_dpo/beta": 0.02472108229994774, "epsilon_dpo/beta_margin_grad_mean": -0.3570942282676697, "epsilon_dpo/beta_margin_grad_std": 0.1856486201286316, "epsilon_dpo/beta_margin_mean": 0.6846122145652771, "epsilon_dpo/beta_margin_std": 0.9263046383857727, "epsilon_dpo/loss_margin_mean": 27.893281936645508, "grad_norm": 46.115360260009766, "kl/avg_steps": 0.59375, "kl/beta": 0.0248662531375885, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -3.5009007453918457, "logits/rejected": -3.5784401893615723, "logps/chosen": -84.17832946777344, "logps/ref_chosen": -57.45970916748047, "logps/ref_rejected": -81.43141174316406, "logps/rejected": -136.04331970214844, "loss": 0.9986, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6646609902381897, "rewards/margins": 0.6846121549606323, "rewards/rejected": -1.3492732048034668, "step": 284 }, { "epoch": 0.4308390022675737, "epsilon_dpo/beta": 0.024606067687273026, "epsilon_dpo/beta_margin_grad_mean": -0.3554127514362335, "epsilon_dpo/beta_margin_grad_std": 0.16883914172649384, "epsilon_dpo/beta_margin_mean": 0.7153359055519104, "epsilon_dpo/beta_margin_std": 0.8846250772476196, "epsilon_dpo/loss_margin_mean": 29.272083282470703, "grad_norm": 53.23637771606445, "kl/avg_steps": 0.46875, "kl/beta": 0.0247194804251194, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.5264957352549375e-07, "logits/chosen": -3.5030999183654785, "logits/rejected": -3.457376003265381, "logps/chosen": -95.3831787109375, "logps/ref_chosen": -64.14385223388672, "logps/ref_rejected": -66.00570678710938, "logps/rejected": -126.51710510253906, "loss": 0.9531, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7711443901062012, "rewards/margins": 0.7153359055519104, "rewards/rejected": -1.4864802360534668, "step": 285 }, { "epoch": 0.4323507180650038, "epsilon_dpo/beta": 0.02443743869662285, "epsilon_dpo/beta_margin_grad_mean": -0.29852718114852905, "epsilon_dpo/beta_margin_grad_std": 0.17430494725704193, "epsilon_dpo/beta_margin_mean": 1.0480760335922241, "epsilon_dpo/beta_margin_std": 1.043094515800476, "epsilon_dpo/loss_margin_mean": 43.06303405761719, "grad_norm": 39.86260223388672, "kl/avg_steps": 0.6875, "kl/beta": 0.024604149162769318, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.514425224712835e-07, "logits/chosen": -3.5416972637176514, "logits/rejected": -3.5913586616516113, "logps/chosen": -82.38316345214844, "logps/ref_chosen": -54.679100036621094, "logps/ref_rejected": -86.51749420166016, "logps/rejected": -157.28457641601562, "loss": 0.7864, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6802202463150024, "rewards/margins": 1.0480761528015137, "rewards/rejected": -1.7282962799072266, "step": 286 }, { "epoch": 0.43386243386243384, "epsilon_dpo/beta": 0.024316400289535522, "epsilon_dpo/beta_margin_grad_mean": -0.33944711089134216, "epsilon_dpo/beta_margin_grad_std": 0.18090681731700897, "epsilon_dpo/beta_margin_mean": 0.8137615323066711, "epsilon_dpo/beta_margin_std": 0.9472151398658752, "epsilon_dpo/loss_margin_mean": 33.69645309448242, "grad_norm": 37.17608642578125, "kl/avg_steps": 0.5, "kl/beta": 0.02443614974617958, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.502326338516534e-07, "logits/chosen": -3.3416614532470703, "logits/rejected": -3.375074625015259, "logps/chosen": -70.60919952392578, "logps/ref_chosen": -44.448997497558594, "logps/ref_rejected": -53.14330291748047, "logps/rejected": -112.99995422363281, "loss": 0.9115, "rewards/accuracies": 0.71875, "rewards/chosen": -0.639596700668335, "rewards/margins": 0.8137615323066711, "rewards/rejected": -1.4533581733703613, "step": 287 }, { "epoch": 0.43537414965986393, "epsilon_dpo/beta": 0.024157429113984108, "epsilon_dpo/beta_margin_grad_mean": -0.3337177336215973, "epsilon_dpo/beta_margin_grad_std": 0.1625463217496872, "epsilon_dpo/beta_margin_mean": 0.7855278253555298, "epsilon_dpo/beta_margin_std": 0.8001845479011536, "epsilon_dpo/loss_margin_mean": 32.684165954589844, "grad_norm": 47.675392150878906, "kl/avg_steps": 0.65625, "kl/beta": 0.024314576759934425, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.490199415097892e-07, "logits/chosen": -3.5289015769958496, "logits/rejected": -3.5425853729248047, "logps/chosen": -88.12208557128906, "logps/ref_chosen": -54.177223205566406, "logps/ref_rejected": -82.56395721435547, "logps/rejected": -149.1929931640625, "loss": 0.885, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8218220472335815, "rewards/margins": 0.7855278253555298, "rewards/rejected": -1.6073498725891113, "step": 288 }, { "epoch": 0.436885865457294, "epsilon_dpo/beta": 0.02403767593204975, "epsilon_dpo/beta_margin_grad_mean": -0.36918365955352783, "epsilon_dpo/beta_margin_grad_std": 0.1830718070268631, "epsilon_dpo/beta_margin_mean": 0.6391366124153137, "epsilon_dpo/beta_margin_std": 0.9101723432540894, "epsilon_dpo/loss_margin_mean": 26.813085556030273, "grad_norm": 45.7679443359375, "kl/avg_steps": 0.5, "kl/beta": 0.024156052619218826, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.4780447936730247e-07, "logits/chosen": -3.3495442867279053, "logits/rejected": -3.3641200065612793, "logps/chosen": -84.00634765625, "logps/ref_chosen": -46.830223083496094, "logps/ref_rejected": -59.32768249511719, "logps/rejected": -123.31689453125, "loss": 1.0229, "rewards/accuracies": 0.734375, "rewards/chosen": -0.896483302116394, "rewards/margins": 0.6391366124153137, "rewards/rejected": -1.5356199741363525, "step": 289 }, { "epoch": 0.4383975812547241, "epsilon_dpo/beta": 0.02390306256711483, "epsilon_dpo/beta_margin_grad_mean": -0.3252708613872528, "epsilon_dpo/beta_margin_grad_std": 0.1821853071451187, "epsilon_dpo/beta_margin_mean": 0.8856675028800964, "epsilon_dpo/beta_margin_std": 0.9666686654090881, "epsilon_dpo/loss_margin_mean": 37.278961181640625, "grad_norm": 50.79010772705078, "kl/avg_steps": 0.5625, "kl/beta": 0.024035874754190445, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.465862814232821e-07, "logits/chosen": -3.3956217765808105, "logits/rejected": -3.540740966796875, "logps/chosen": -89.17860412597656, "logps/ref_chosen": -53.77537536621094, "logps/ref_rejected": -89.06211853027344, "logps/rejected": -161.74429321289062, "loss": 0.872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8493714928627014, "rewards/margins": 0.8856675028800964, "rewards/rejected": -1.7350389957427979, "step": 290 }, { "epoch": 0.4399092970521542, "epsilon_dpo/beta": 0.02375442162156105, "epsilon_dpo/beta_margin_grad_mean": -0.3296690583229065, "epsilon_dpo/beta_margin_grad_std": 0.18409493565559387, "epsilon_dpo/beta_margin_mean": 0.8777821660041809, "epsilon_dpo/beta_margin_std": 1.0467389822006226, "epsilon_dpo/loss_margin_mean": 37.156375885009766, "grad_norm": 49.14398193359375, "kl/avg_steps": 0.625, "kl/beta": 0.023901429027318954, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.4536538175334343e-07, "logits/chosen": -3.410175323486328, "logits/rejected": -3.5372653007507324, "logps/chosen": -74.68447875976562, "logps/ref_chosen": -46.044700622558594, "logps/ref_rejected": -77.96891021728516, "logps/rejected": -143.76507568359375, "loss": 0.8979, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6832265853881836, "rewards/margins": 0.8777821660041809, "rewards/rejected": -1.5610086917877197, "step": 291 }, { "epoch": 0.4414210128495843, "epsilon_dpo/beta": 0.02365141734480858, "epsilon_dpo/beta_margin_grad_mean": -0.3605436086654663, "epsilon_dpo/beta_margin_grad_std": 0.19331659376621246, "epsilon_dpo/beta_margin_mean": 0.6995589733123779, "epsilon_dpo/beta_margin_std": 0.9916879534721375, "epsilon_dpo/loss_margin_mean": 29.841161727905273, "grad_norm": 44.712215423583984, "kl/avg_steps": 0.4375, "kl/beta": 0.02375297248363495, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.4414181450867465e-07, "logits/chosen": -3.3692009449005127, "logits/rejected": -3.4043450355529785, "logps/chosen": -85.44560241699219, "logps/ref_chosen": -55.59161376953125, "logps/ref_rejected": -64.73518371582031, "logps/rejected": -124.43032836914062, "loss": 1.0087, "rewards/accuracies": 0.75, "rewards/chosen": -0.7098550796508789, "rewards/margins": 0.6995589733123779, "rewards/rejected": -1.4094140529632568, "step": 292 }, { "epoch": 0.4429327286470144, "epsilon_dpo/beta": 0.023489264771342278, "epsilon_dpo/beta_margin_grad_mean": -0.32560887932777405, "epsilon_dpo/beta_margin_grad_std": 0.1817702203989029, "epsilon_dpo/beta_margin_mean": 0.8625008463859558, "epsilon_dpo/beta_margin_std": 0.9884577989578247, "epsilon_dpo/loss_margin_mean": 36.910213470458984, "grad_norm": 39.234649658203125, "kl/avg_steps": 0.6875, "kl/beta": 0.023649506270885468, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.4291561391508185e-07, "logits/chosen": -3.3564453125, "logits/rejected": -3.5165536403656006, "logps/chosen": -76.51992797851562, "logps/ref_chosen": -46.776344299316406, "logps/ref_rejected": -78.05533599853516, "logps/rejected": -144.70912170410156, "loss": 0.8939, "rewards/accuracies": 0.828125, "rewards/chosen": -0.700920581817627, "rewards/margins": 0.8625009059906006, "rewards/rejected": -1.5634214878082275, "step": 293 }, { "epoch": 0.4444444444444444, "epsilon_dpo/beta": 0.023358240723609924, "epsilon_dpo/beta_margin_grad_mean": -0.3368472456932068, "epsilon_dpo/beta_margin_grad_std": 0.16931232810020447, "epsilon_dpo/beta_margin_mean": 0.8159635066986084, "epsilon_dpo/beta_margin_std": 0.9154258370399475, "epsilon_dpo/loss_margin_mean": 35.136470794677734, "grad_norm": 37.46754837036133, "kl/avg_steps": 0.5625, "kl/beta": 0.02348802611231804, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.4168681427203153e-07, "logits/chosen": -3.450556755065918, "logits/rejected": -3.439328670501709, "logps/chosen": -81.38284301757812, "logps/ref_chosen": -51.676170349121094, "logps/ref_rejected": -70.57459259033203, "logps/rejected": -135.41773986816406, "loss": 0.8938, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6945424675941467, "rewards/margins": 0.8159635066986084, "rewards/rejected": -1.5105060338974, "step": 294 }, { "epoch": 0.4459561602418745, "epsilon_dpo/beta": 0.02318378910422325, "epsilon_dpo/beta_margin_grad_mean": -0.3320719301700592, "epsilon_dpo/beta_margin_grad_std": 0.15831997990608215, "epsilon_dpo/beta_margin_mean": 0.8186854720115662, "epsilon_dpo/beta_margin_std": 0.8354726433753967, "epsilon_dpo/loss_margin_mean": 35.43844223022461, "grad_norm": 45.94647979736328, "kl/avg_steps": 0.75, "kl/beta": 0.023356644436717033, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.4045544995169125e-07, "logits/chosen": -3.4075112342834473, "logits/rejected": -3.538727283477783, "logps/chosen": -74.18840026855469, "logps/ref_chosen": -44.85515594482422, "logps/ref_rejected": -74.43038177490234, "logps/rejected": -139.20205688476562, "loss": 0.8681, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6814078092575073, "rewards/margins": 0.8186854124069214, "rewards/rejected": -1.5000932216644287, "step": 295 }, { "epoch": 0.4474678760393046, "epsilon_dpo/beta": 0.02305467613041401, "epsilon_dpo/beta_margin_grad_mean": -0.32794496417045593, "epsilon_dpo/beta_margin_grad_std": 0.1824619323015213, "epsilon_dpo/beta_margin_mean": 0.8609923124313354, "epsilon_dpo/beta_margin_std": 0.9506134986877441, "epsilon_dpo/loss_margin_mean": 37.576324462890625, "grad_norm": 46.21034622192383, "kl/avg_steps": 0.5625, "kl/beta": 0.02318277396261692, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.392215553979679e-07, "logits/chosen": -3.5375661849975586, "logits/rejected": -3.572035312652588, "logps/chosen": -86.96758270263672, "logps/ref_chosen": -58.935638427734375, "logps/ref_rejected": -78.65898895263672, "logps/rejected": -144.2672576904297, "loss": 0.8842, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6479707956314087, "rewards/margins": 0.8609922528266907, "rewards/rejected": -1.5089631080627441, "step": 296 }, { "epoch": 0.4489795918367347, "epsilon_dpo/beta": 0.022918514907360077, "epsilon_dpo/beta_margin_grad_mean": -0.31877249479293823, "epsilon_dpo/beta_margin_grad_std": 0.16733162105083466, "epsilon_dpo/beta_margin_mean": 0.8775285482406616, "epsilon_dpo/beta_margin_std": 0.839491069316864, "epsilon_dpo/loss_margin_mean": 38.50216293334961, "grad_norm": 45.40238571166992, "kl/avg_steps": 0.59375, "kl/beta": 0.02305310033261776, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.3798516512554485e-07, "logits/chosen": -3.4466686248779297, "logits/rejected": -3.374145030975342, "logps/chosen": -81.90016174316406, "logps/ref_chosen": -53.04302978515625, "logps/ref_rejected": -62.58563232421875, "logps/rejected": -129.94493103027344, "loss": 0.8384, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6638805270195007, "rewards/margins": 0.8775285482406616, "rewards/rejected": -1.5414090156555176, "step": 297 }, { "epoch": 0.4504913076341648, "epsilon_dpo/beta": 0.0227617509663105, "epsilon_dpo/beta_margin_grad_mean": -0.341703325510025, "epsilon_dpo/beta_margin_grad_std": 0.17840270698070526, "epsilon_dpo/beta_margin_mean": 0.7759517431259155, "epsilon_dpo/beta_margin_std": 0.9039034843444824, "epsilon_dpo/loss_margin_mean": 34.26759719848633, "grad_norm": 48.5256233215332, "kl/avg_steps": 0.6875, "kl/beta": 0.02291703037917614, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.367463137189156e-07, "logits/chosen": -3.472851276397705, "logits/rejected": -3.4458398818969727, "logps/chosen": -74.42074584960938, "logps/ref_chosen": -47.468482971191406, "logps/ref_rejected": -64.02200317382812, "logps/rejected": -125.24186706542969, "loss": 0.9249, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6156304478645325, "rewards/margins": 0.7759517431259155, "rewards/rejected": -1.3915821313858032, "step": 298 }, { "epoch": 0.4520030234315949, "epsilon_dpo/beta": 0.022649012506008148, "epsilon_dpo/beta_margin_grad_mean": -0.34175968170166016, "epsilon_dpo/beta_margin_grad_std": 0.1833636313676834, "epsilon_dpo/beta_margin_mean": 0.8129475712776184, "epsilon_dpo/beta_margin_std": 1.0239810943603516, "epsilon_dpo/loss_margin_mean": 36.14271926879883, "grad_norm": 40.84474182128906, "kl/avg_steps": 0.5, "kl/beta": 0.022760551422834396, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.355050358314172e-07, "logits/chosen": -3.4653258323669434, "logits/rejected": -3.4130988121032715, "logps/chosen": -74.36851501464844, "logps/ref_chosen": -52.6894416809082, "logps/ref_rejected": -65.37330627441406, "logps/rejected": -123.19509887695312, "loss": 0.9277, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49251508712768555, "rewards/margins": 0.8129475116729736, "rewards/rejected": -1.3054625988006592, "step": 299 }, { "epoch": 0.45351473922902497, "epsilon_dpo/beta": 0.022518599405884743, "epsilon_dpo/beta_margin_grad_mean": -0.3511313796043396, "epsilon_dpo/beta_margin_grad_std": 0.16206586360931396, "epsilon_dpo/beta_margin_mean": 0.6956834197044373, "epsilon_dpo/beta_margin_std": 0.8197082281112671, "epsilon_dpo/loss_margin_mean": 31.093141555786133, "grad_norm": 47.42357635498047, "kl/avg_steps": 0.578125, "kl/beta": 0.02264731377363205, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.3426136618426043e-07, "logits/chosen": -3.408918857574463, "logits/rejected": -3.489590644836426, "logps/chosen": -82.29682922363281, "logps/ref_chosen": -55.035194396972656, "logps/ref_rejected": -72.62406921386719, "logps/rejected": -130.97885131835938, "loss": 0.949, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6143022775650024, "rewards/margins": 0.6956834197044373, "rewards/rejected": -1.3099857568740845, "step": 300 }, { "epoch": 0.45351473922902497, "eval_epsilon_dpo/beta": 0.02242591604590416, "eval_epsilon_dpo/beta_margin_grad_mean": -0.39154955744743347, "eval_epsilon_dpo/beta_margin_grad_std": 0.18750175833702087, "eval_epsilon_dpo/beta_margin_mean": 0.5224977731704712, "eval_epsilon_dpo/beta_margin_std": 0.9255793690681458, "eval_epsilon_dpo/loss_margin_mean": 23.575237274169922, "eval_kl/n_epsilon_steps": 0.29181337356567383, "eval_kl/p_epsilon_steps": 0.7068662047386169, "eval_logits/chosen": -3.5919747352600098, "eval_logits/rejected": -3.5349626541137695, "eval_logps/chosen": -107.06302642822266, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -126.75774383544922, "eval_loss": 0.5616350769996643, "eval_rewards/accuracies": 0.7121478915214539, "eval_rewards/chosen": -0.6680159568786621, "eval_rewards/margins": 0.5224977731704712, "eval_rewards/rejected": -1.1905137300491333, "eval_runtime": 37.1519, "eval_samples_per_second": 61.989, "eval_steps_per_second": 1.938, "step": 300 }, { "epoch": 0.455026455026455, "epsilon_dpo/beta": 0.022420866414904594, "epsilon_dpo/beta_margin_grad_mean": -0.37266838550567627, "epsilon_dpo/beta_margin_grad_std": 0.1866014003753662, "epsilon_dpo/beta_margin_mean": 0.6351537704467773, "epsilon_dpo/beta_margin_std": 0.9442269802093506, "epsilon_dpo/loss_margin_mean": 28.60137176513672, "grad_norm": 53.439056396484375, "kl/avg_steps": 0.4375, "kl/beta": 0.022517137229442596, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.3301533956555885e-07, "logits/chosen": -3.4169836044311523, "logits/rejected": -3.398585319519043, "logps/chosen": -77.97047424316406, "logps/ref_chosen": -48.728675842285156, "logps/ref_rejected": -63.57511901855469, "logps/rejected": -121.41829681396484, "loss": 1.0355, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6589154005050659, "rewards/margins": 0.6351537704467773, "rewards/rejected": -1.2940691709518433, "step": 301 }, { "epoch": 0.4565381708238851, "epsilon_dpo/beta": 0.022337215021252632, "epsilon_dpo/beta_margin_grad_mean": -0.4189479649066925, "epsilon_dpo/beta_margin_grad_std": 0.19645841419696808, "epsilon_dpo/beta_margin_mean": 0.4186948239803314, "epsilon_dpo/beta_margin_std": 1.034669280052185, "epsilon_dpo/loss_margin_mean": 19.042078018188477, "grad_norm": 64.08478546142578, "kl/avg_steps": 0.375, "kl/beta": 0.022419054061174393, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.317669908293554e-07, "logits/chosen": -3.4411935806274414, "logits/rejected": -3.542949676513672, "logps/chosen": -92.18022155761719, "logps/ref_chosen": -60.664024353027344, "logps/ref_rejected": -78.62321472167969, "logps/rejected": -129.18148803710938, "loss": 1.2315, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7071412205696106, "rewards/margins": 0.41869479417800903, "rewards/rejected": -1.1258360147476196, "step": 302 }, { "epoch": 0.4580498866213152, "epsilon_dpo/beta": 0.022218862548470497, "epsilon_dpo/beta_margin_grad_mean": -0.33864742517471313, "epsilon_dpo/beta_margin_grad_std": 0.1839238703250885, "epsilon_dpo/beta_margin_mean": 0.7744192481040955, "epsilon_dpo/beta_margin_std": 0.9165587425231934, "epsilon_dpo/loss_margin_mean": 35.11915588378906, "grad_norm": 46.36555099487305, "kl/avg_steps": 0.53125, "kl/beta": 0.022335296496748924, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.3051635489464793e-07, "logits/chosen": -3.477597713470459, "logits/rejected": -3.5051486492156982, "logps/chosen": -77.63362884521484, "logps/ref_chosen": -55.59126281738281, "logps/ref_rejected": -79.28375244140625, "logps/rejected": -136.44528198242188, "loss": 0.9341, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49125686287879944, "rewards/margins": 0.7744193077087402, "rewards/rejected": -1.2656761407852173, "step": 303 }, { "epoch": 0.4595616024187453, "epsilon_dpo/beta": 0.02204590104520321, "epsilon_dpo/beta_margin_grad_mean": -0.3089589774608612, "epsilon_dpo/beta_margin_grad_std": 0.1462126076221466, "epsilon_dpo/beta_margin_mean": 0.92357337474823, "epsilon_dpo/beta_margin_std": 0.7905997037887573, "epsilon_dpo/loss_margin_mean": 42.00259017944336, "grad_norm": 35.90860366821289, "kl/avg_steps": 0.78125, "kl/beta": 0.022217268124222755, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.292634667444117e-07, "logits/chosen": -3.528080463409424, "logits/rejected": -3.4996469020843506, "logps/chosen": -70.59703826904297, "logps/ref_chosen": -53.68409729003906, "logps/ref_rejected": -72.39662170410156, "logps/rejected": -131.31214904785156, "loss": 0.7869, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3735690116882324, "rewards/margins": 0.92357337474823, "rewards/rejected": -1.2971423864364624, "step": 304 }, { "epoch": 0.46107331821617537, "epsilon_dpo/beta": 0.021930119022727013, "epsilon_dpo/beta_margin_grad_mean": -0.35909682512283325, "epsilon_dpo/beta_margin_grad_std": 0.18305020034313202, "epsilon_dpo/beta_margin_mean": 0.6859689354896545, "epsilon_dpo/beta_margin_std": 0.8998550772666931, "epsilon_dpo/loss_margin_mean": 31.527374267578125, "grad_norm": 41.75257110595703, "kl/avg_steps": 0.53125, "kl/beta": 0.022045040503144264, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.280083614246217e-07, "logits/chosen": -3.5517983436584473, "logits/rejected": -3.4454336166381836, "logps/chosen": -84.57362365722656, "logps/ref_chosen": -61.988155364990234, "logps/ref_rejected": -62.704551696777344, "logps/rejected": -116.81739807128906, "loss": 0.9874, "rewards/accuracies": 0.75, "rewards/chosen": -0.4973277449607849, "rewards/margins": 0.6859689950942993, "rewards/rejected": -1.1832966804504395, "step": 305 }, { "epoch": 0.46258503401360546, "epsilon_dpo/beta": 0.0217936709523201, "epsilon_dpo/beta_margin_grad_mean": -0.357360303401947, "epsilon_dpo/beta_margin_grad_std": 0.16487321257591248, "epsilon_dpo/beta_margin_mean": 0.6856143474578857, "epsilon_dpo/beta_margin_std": 0.8294694423675537, "epsilon_dpo/loss_margin_mean": 31.637510299682617, "grad_norm": 46.81013107299805, "kl/avg_steps": 0.625, "kl/beta": 0.021928545087575912, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.267510740432719e-07, "logits/chosen": -3.3208365440368652, "logits/rejected": -3.4407358169555664, "logps/chosen": -68.32638549804688, "logps/ref_chosen": -49.447906494140625, "logps/ref_rejected": -66.84622955322266, "logps/rejected": -117.36221313476562, "loss": 0.9589, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4130626320838928, "rewards/margins": 0.6856143474578857, "rewards/rejected": -1.0986769199371338, "step": 306 }, { "epoch": 0.46409674981103555, "epsilon_dpo/beta": 0.02168554998934269, "epsilon_dpo/beta_margin_grad_mean": -0.4127929210662842, "epsilon_dpo/beta_margin_grad_std": 0.17212146520614624, "epsilon_dpo/beta_margin_mean": 0.4111768305301666, "epsilon_dpo/beta_margin_std": 0.8121265769004822, "epsilon_dpo/loss_margin_mean": 19.175983428955078, "grad_norm": 52.534400939941406, "kl/avg_steps": 0.5, "kl/beta": 0.021792342886328697, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.2549163976939285e-07, "logits/chosen": -3.4779303073883057, "logits/rejected": -3.4345755577087402, "logps/chosen": -68.0585708618164, "logps/ref_chosen": -50.486167907714844, "logps/ref_rejected": -58.821529388427734, "logps/rejected": -95.56991577148438, "loss": 1.1645, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38303613662719727, "rewards/margins": 0.411176860332489, "rewards/rejected": -0.7942129969596863, "step": 307 }, { "epoch": 0.4656084656084656, "epsilon_dpo/beta": 0.021591216325759888, "epsilon_dpo/beta_margin_grad_mean": -0.3740115165710449, "epsilon_dpo/beta_margin_grad_std": 0.17493607103824615, "epsilon_dpo/beta_margin_mean": 0.5918732285499573, "epsilon_dpo/beta_margin_std": 0.8551866412162781, "epsilon_dpo/loss_margin_mean": 27.666423797607422, "grad_norm": 43.65812301635742, "kl/avg_steps": 0.4375, "kl/beta": 0.02168392390012741, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.2423009383206874e-07, "logits/chosen": -3.566082000732422, "logits/rejected": -3.5047104358673096, "logps/chosen": -80.58539581298828, "logps/ref_chosen": -60.2354736328125, "logps/ref_rejected": -66.9232177734375, "logps/rejected": -114.93956756591797, "loss": 1.0382, "rewards/accuracies": 0.75, "rewards/chosen": -0.4412750005722046, "rewards/margins": 0.591873288154602, "rewards/rejected": -1.0331482887268066, "step": 308 }, { "epoch": 0.4671201814058957, "epsilon_dpo/beta": 0.021443186327815056, "epsilon_dpo/beta_margin_grad_mean": -0.3469441533088684, "epsilon_dpo/beta_margin_grad_std": 0.14971475303173065, "epsilon_dpo/beta_margin_mean": 0.7270935773849487, "epsilon_dpo/beta_margin_std": 0.7648057341575623, "epsilon_dpo/loss_margin_mean": 34.042545318603516, "grad_norm": 47.88819122314453, "kl/avg_steps": 0.6875, "kl/beta": 0.02158946916460991, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.229664715194511e-07, "logits/chosen": -3.478738784790039, "logits/rejected": -3.470277786254883, "logps/chosen": -74.05812072753906, "logps/ref_chosen": -53.74769592285156, "logps/ref_rejected": -71.52200317382812, "logps/rejected": -125.87498474121094, "loss": 0.9073, "rewards/accuracies": 0.859375, "rewards/chosen": -0.43678274750709534, "rewards/margins": 0.7270935773849487, "rewards/rejected": -1.1638762950897217, "step": 309 }, { "epoch": 0.46863189720332576, "epsilon_dpo/beta": 0.021390588954091072, "epsilon_dpo/beta_margin_grad_mean": -0.416938453912735, "epsilon_dpo/beta_margin_grad_std": 0.2001698613166809, "epsilon_dpo/beta_margin_mean": 0.3982345163822174, "epsilon_dpo/beta_margin_std": 0.9748703241348267, "epsilon_dpo/loss_margin_mean": 18.95297622680664, "grad_norm": 65.61370086669922, "kl/avg_steps": 0.25, "kl/beta": 0.021442055702209473, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.2170080817777257e-07, "logits/chosen": -3.6341333389282227, "logits/rejected": -3.5537829399108887, "logps/chosen": -102.85679626464844, "logps/ref_chosen": -71.20709228515625, "logps/ref_rejected": -69.59140014648438, "logps/rejected": -120.19407653808594, "loss": 1.2378, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6796282529830933, "rewards/margins": 0.398234486579895, "rewards/rejected": -1.0778627395629883, "step": 310 }, { "epoch": 0.47014361300075586, "epsilon_dpo/beta": 0.021290455013513565, "epsilon_dpo/beta_margin_grad_mean": -0.392009437084198, "epsilon_dpo/beta_margin_grad_std": 0.18163029849529266, "epsilon_dpo/beta_margin_mean": 0.5403109788894653, "epsilon_dpo/beta_margin_std": 0.9781794548034668, "epsilon_dpo/loss_margin_mean": 25.632801055908203, "grad_norm": 45.10748291015625, "kl/avg_steps": 0.46875, "kl/beta": 0.0213885847479105, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.204331392103574e-07, "logits/chosen": -3.4072837829589844, "logits/rejected": -3.6568784713745117, "logps/chosen": -71.02266693115234, "logps/ref_chosen": -50.57222366333008, "logps/ref_rejected": -86.37225341796875, "logps/rejected": -132.45550537109375, "loss": 1.1095, "rewards/accuracies": 0.71875, "rewards/chosen": -0.43778371810913086, "rewards/margins": 0.5403109788894653, "rewards/rejected": -0.9780946969985962, "step": 311 }, { "epoch": 0.47165532879818595, "epsilon_dpo/beta": 0.021164506673812866, "epsilon_dpo/beta_margin_grad_mean": -0.3533521294593811, "epsilon_dpo/beta_margin_grad_std": 0.16184784471988678, "epsilon_dpo/beta_margin_mean": 0.6904141306877136, "epsilon_dpo/beta_margin_std": 0.7821415066719055, "epsilon_dpo/loss_margin_mean": 32.817081451416016, "grad_norm": 43.32979965209961, "kl/avg_steps": 0.59375, "kl/beta": 0.02128879353404045, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.1916350007663176e-07, "logits/chosen": -3.368271827697754, "logits/rejected": -3.4137144088745117, "logps/chosen": -72.72174072265625, "logps/ref_chosen": -47.678794860839844, "logps/ref_rejected": -61.4392204284668, "logps/rejected": -119.29924774169922, "loss": 0.9434, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5316519141197205, "rewards/margins": 0.6904141902923584, "rewards/rejected": -1.222066044807434, "step": 312 }, { "epoch": 0.47316704459561604, "epsilon_dpo/beta": 0.021039582788944244, "epsilon_dpo/beta_margin_grad_mean": -0.3826296329498291, "epsilon_dpo/beta_margin_grad_std": 0.1834283173084259, "epsilon_dpo/beta_margin_mean": 0.5433450937271118, "epsilon_dpo/beta_margin_std": 0.8974889516830444, "epsilon_dpo/loss_margin_mean": 26.066537857055664, "grad_norm": 42.01582717895508, "kl/avg_steps": 0.59375, "kl/beta": 0.021163135766983032, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.178919262911314e-07, "logits/chosen": -3.482300281524658, "logits/rejected": -3.421327590942383, "logps/chosen": -74.80753326416016, "logps/ref_chosen": -52.62970733642578, "logps/ref_rejected": -60.99772644042969, "logps/rejected": -109.24209594726562, "loss": 1.092, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4693948030471802, "rewards/margins": 0.5433450937271118, "rewards/rejected": -1.012739896774292, "step": 313 }, { "epoch": 0.47467876039304613, "epsilon_dpo/beta": 0.020961424335837364, "epsilon_dpo/beta_margin_grad_mean": -0.35413360595703125, "epsilon_dpo/beta_margin_grad_std": 0.17942708730697632, "epsilon_dpo/beta_margin_mean": 0.7215368747711182, "epsilon_dpo/beta_margin_std": 0.9011567831039429, "epsilon_dpo/loss_margin_mean": 34.711097717285156, "grad_norm": 46.95634078979492, "kl/avg_steps": 0.375, "kl/beta": 0.021038223057985306, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.166184534225087e-07, "logits/chosen": -3.481204032897949, "logits/rejected": -3.404500961303711, "logps/chosen": -84.33619689941406, "logps/ref_chosen": -62.14019775390625, "logps/ref_rejected": -66.03670501708984, "logps/rejected": -122.94380950927734, "loss": 0.9597, "rewards/accuracies": 0.6875, "rewards/chosen": -0.466935396194458, "rewards/margins": 0.7215368747711182, "rewards/rejected": -1.1884722709655762, "step": 314 }, { "epoch": 0.47619047619047616, "epsilon_dpo/beta": 0.020830707624554634, "epsilon_dpo/beta_margin_grad_mean": -0.348887175321579, "epsilon_dpo/beta_margin_grad_std": 0.16530612111091614, "epsilon_dpo/beta_margin_mean": 0.7323974967002869, "epsilon_dpo/beta_margin_std": 0.8364558219909668, "epsilon_dpo/loss_margin_mean": 35.35239028930664, "grad_norm": 44.713226318359375, "kl/avg_steps": 0.625, "kl/beta": 0.020959623157978058, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.1534311709253723e-07, "logits/chosen": -3.492908477783203, "logits/rejected": -3.437889575958252, "logps/chosen": -81.35464477539062, "logps/ref_chosen": -59.008056640625, "logps/ref_rejected": -69.71574401855469, "logps/rejected": -127.41471862792969, "loss": 0.9286, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46730703115463257, "rewards/margins": 0.7323974967002869, "rewards/rejected": -1.199704647064209, "step": 315 }, { "epoch": 0.47770219198790626, "epsilon_dpo/beta": 0.020714344456791878, "epsilon_dpo/beta_margin_grad_mean": -0.3380410671234131, "epsilon_dpo/beta_margin_grad_std": 0.18695685267448425, "epsilon_dpo/beta_margin_mean": 0.8087754249572754, "epsilon_dpo/beta_margin_std": 0.9806798696517944, "epsilon_dpo/loss_margin_mean": 39.30621337890625, "grad_norm": 45.41092300415039, "kl/avg_steps": 0.5625, "kl/beta": 0.020829439163208008, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.1406595297511564e-07, "logits/chosen": -3.3787903785705566, "logits/rejected": -3.468698501586914, "logps/chosen": -75.8724365234375, "logps/ref_chosen": -51.71154022216797, "logps/ref_rejected": -87.0086441040039, "logps/rejected": -150.4757537841797, "loss": 0.9287, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5017073154449463, "rewards/margins": 0.8087753653526306, "rewards/rejected": -1.3104827404022217, "step": 316 }, { "epoch": 0.47921390778533635, "epsilon_dpo/beta": 0.020579056814312935, "epsilon_dpo/beta_margin_grad_mean": -0.33037692308425903, "epsilon_dpo/beta_margin_grad_std": 0.1699550896883011, "epsilon_dpo/beta_margin_mean": 0.7959507703781128, "epsilon_dpo/beta_margin_std": 0.8738538026809692, "epsilon_dpo/loss_margin_mean": 38.87232208251953, "grad_norm": 52.40599822998047, "kl/avg_steps": 0.65625, "kl/beta": 0.020712928846478462, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.1278699679526975e-07, "logits/chosen": -3.3670268058776855, "logits/rejected": -3.3748703002929688, "logps/chosen": -72.243896484375, "logps/ref_chosen": -52.302345275878906, "logps/ref_rejected": -65.77944946289062, "logps/rejected": -124.59332275390625, "loss": 0.9026, "rewards/accuracies": 0.84375, "rewards/chosen": -0.41306257247924805, "rewards/margins": 0.7959507703781128, "rewards/rejected": -1.2090133428573608, "step": 317 }, { "epoch": 0.48072562358276644, "epsilon_dpo/beta": 0.02051563188433647, "epsilon_dpo/beta_margin_grad_mean": -0.3677760362625122, "epsilon_dpo/beta_margin_grad_std": 0.20929959416389465, "epsilon_dpo/beta_margin_mean": 0.6926091313362122, "epsilon_dpo/beta_margin_std": 1.0777024030685425, "epsilon_dpo/loss_margin_mean": 34.12948226928711, "grad_norm": 50.718929290771484, "kl/avg_steps": 0.3125, "kl/beta": 0.020577887073159218, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.1150628432815336e-07, "logits/chosen": -3.33195161819458, "logits/rejected": -3.416433334350586, "logps/chosen": -65.81509399414062, "logps/ref_chosen": -43.39962387084961, "logps/ref_rejected": -72.62667083740234, "logps/rejected": -129.17161560058594, "loss": 1.0496, "rewards/accuracies": 0.640625, "rewards/chosen": -0.4628320634365082, "rewards/margins": 0.6926091313362122, "rewards/rejected": -1.155441164970398, "step": 318 }, { "epoch": 0.48223733938019653, "epsilon_dpo/beta": 0.020406844094395638, "epsilon_dpo/beta_margin_grad_mean": -0.3538264334201813, "epsilon_dpo/beta_margin_grad_std": 0.17822809517383575, "epsilon_dpo/beta_margin_mean": 0.7163873314857483, "epsilon_dpo/beta_margin_std": 0.8928670883178711, "epsilon_dpo/loss_margin_mean": 35.35110092163086, "grad_norm": 49.28916931152344, "kl/avg_steps": 0.53125, "kl/beta": 0.020513782277703285, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.1022385139804707e-07, "logits/chosen": -3.4634480476379395, "logits/rejected": -3.451408863067627, "logps/chosen": -78.55719757080078, "logps/ref_chosen": -52.94717788696289, "logps/ref_rejected": -66.69010162353516, "logps/rejected": -127.65122985839844, "loss": 0.9612, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5240877866744995, "rewards/margins": 0.7163873910903931, "rewards/rejected": -1.2404751777648926, "step": 319 }, { "epoch": 0.4837490551776266, "epsilon_dpo/beta": 0.020292626693844795, "epsilon_dpo/beta_margin_grad_mean": -0.36078941822052, "epsilon_dpo/beta_margin_grad_std": 0.1806831955909729, "epsilon_dpo/beta_margin_mean": 0.6665822863578796, "epsilon_dpo/beta_margin_std": 0.8781885504722595, "epsilon_dpo/loss_margin_mean": 33.08497619628906, "grad_norm": 42.472694396972656, "kl/avg_steps": 0.5625, "kl/beta": 0.020405378192663193, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -3.356529474258423, "logits/rejected": -3.3220431804656982, "logps/chosen": -64.42037963867188, "logps/ref_chosen": -42.857521057128906, "logps/ref_rejected": -60.914329528808594, "logps/rejected": -115.56216430664062, "loss": 0.9947, "rewards/accuracies": 0.796875, "rewards/chosen": -0.43974870443344116, "rewards/margins": 0.6665823459625244, "rewards/rejected": -1.1063309907913208, "step": 320 }, { "epoch": 0.4852607709750567, "epsilon_dpo/beta": 0.020166436210274696, "epsilon_dpo/beta_margin_grad_mean": -0.3521350920200348, "epsilon_dpo/beta_margin_grad_std": 0.19579117000102997, "epsilon_dpo/beta_margin_mean": 0.7144888043403625, "epsilon_dpo/beta_margin_std": 0.986595630645752, "epsilon_dpo/loss_margin_mean": 35.70862579345703, "grad_norm": 54.442298889160156, "kl/avg_steps": 0.625, "kl/beta": 0.020291239023208618, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.0765396768561004e-07, "logits/chosen": -3.316373348236084, "logits/rejected": -3.3210296630859375, "logps/chosen": -71.23872375488281, "logps/ref_chosen": -47.02752685546875, "logps/ref_rejected": -49.298683166503906, "logps/rejected": -109.218505859375, "loss": 1.001, "rewards/accuracies": 0.78125, "rewards/chosen": -0.49150192737579346, "rewards/margins": 0.7144888639450073, "rewards/rejected": -1.2059907913208008, "step": 321 }, { "epoch": 0.48677248677248675, "epsilon_dpo/beta": 0.02001596987247467, "epsilon_dpo/beta_margin_grad_mean": -0.32754746079444885, "epsilon_dpo/beta_margin_grad_std": 0.169097438454628, "epsilon_dpo/beta_margin_mean": 0.815572202205658, "epsilon_dpo/beta_margin_std": 0.8407673239707947, "epsilon_dpo/loss_margin_mean": 40.91514205932617, "grad_norm": 37.931793212890625, "kl/avg_steps": 0.75, "kl/beta": 0.020165206864476204, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.063665887884511e-07, "logits/chosen": -3.3091092109680176, "logits/rejected": -3.4334654808044434, "logps/chosen": -74.68165588378906, "logps/ref_chosen": -42.690452575683594, "logps/ref_rejected": -72.3783187866211, "logps/rejected": -145.28466796875, "loss": 0.8803, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6424614191055298, "rewards/margins": 0.815572202205658, "rewards/rejected": -1.458033561706543, "step": 322 }, { "epoch": 0.48828420256991684, "epsilon_dpo/beta": 0.019923266023397446, "epsilon_dpo/beta_margin_grad_mean": -0.39230355620384216, "epsilon_dpo/beta_margin_grad_std": 0.21131977438926697, "epsilon_dpo/beta_margin_mean": 0.5506172180175781, "epsilon_dpo/beta_margin_std": 1.0944788455963135, "epsilon_dpo/loss_margin_mean": 27.97972869873047, "grad_norm": 57.860679626464844, "kl/avg_steps": 0.46875, "kl/beta": 0.020015094429254532, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.0507763319663517e-07, "logits/chosen": -3.381880760192871, "logits/rejected": -3.4035162925720215, "logps/chosen": -91.85194396972656, "logps/ref_chosen": -56.76236343383789, "logps/ref_rejected": -81.0357666015625, "logps/rejected": -144.10507202148438, "loss": 1.1605, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7029722929000854, "rewards/margins": 0.5506172180175781, "rewards/rejected": -1.2535896301269531, "step": 323 }, { "epoch": 0.4897959183673469, "epsilon_dpo/beta": 0.019755596294999123, "epsilon_dpo/beta_margin_grad_mean": -0.32296764850616455, "epsilon_dpo/beta_margin_grad_std": 0.14631915092468262, "epsilon_dpo/beta_margin_mean": 0.8451575636863708, "epsilon_dpo/beta_margin_std": 0.8046606183052063, "epsilon_dpo/loss_margin_mean": 42.87529754638672, "grad_norm": 47.244667053222656, "kl/avg_steps": 0.84375, "kl/beta": 0.019921710714697838, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.0378713696502097e-07, "logits/chosen": -3.435929775238037, "logits/rejected": -3.3869004249572754, "logps/chosen": -68.54899597167969, "logps/ref_chosen": -48.64381790161133, "logps/ref_rejected": -65.56362915039062, "logps/rejected": -128.3441162109375, "loss": 0.8369, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39476922154426575, "rewards/margins": 0.8451576232910156, "rewards/rejected": -1.239926815032959, "step": 324 }, { "epoch": 0.491307634164777, "epsilon_dpo/beta": 0.019621174782514572, "epsilon_dpo/beta_margin_grad_mean": -0.3394243121147156, "epsilon_dpo/beta_margin_grad_std": 0.1761888712644577, "epsilon_dpo/beta_margin_mean": 0.7975197434425354, "epsilon_dpo/beta_margin_std": 0.9371805191040039, "epsilon_dpo/loss_margin_mean": 40.85676193237305, "grad_norm": 48.55106735229492, "kl/avg_steps": 0.6875, "kl/beta": 0.019755028188228607, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.0249513619156206e-07, "logits/chosen": -3.348618745803833, "logits/rejected": -3.3598568439483643, "logps/chosen": -82.07290649414062, "logps/ref_chosen": -51.189205169677734, "logps/ref_rejected": -72.43170928955078, "logps/rejected": -144.17218017578125, "loss": 0.9155, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6086115837097168, "rewards/margins": 0.7975197434425354, "rewards/rejected": -1.4061312675476074, "step": 325 }, { "epoch": 0.4928193499622071, "epsilon_dpo/beta": 0.019560782238841057, "epsilon_dpo/beta_margin_grad_mean": -0.40435653924942017, "epsilon_dpo/beta_margin_grad_std": 0.18904957175254822, "epsilon_dpo/beta_margin_mean": 0.455229789018631, "epsilon_dpo/beta_margin_std": 0.8902692794799805, "epsilon_dpo/loss_margin_mean": 23.6031436920166, "grad_norm": 58.8565788269043, "kl/avg_steps": 0.3125, "kl/beta": 0.0196201391518116, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.012016670162977e-07, "logits/chosen": -3.4327940940856934, "logits/rejected": -3.3189120292663574, "logps/chosen": -110.60107421875, "logps/ref_chosen": -65.69468688964844, "logps/ref_rejected": -67.0368881225586, "logps/rejected": -135.54641723632812, "loss": 1.1597, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8808815479278564, "rewards/margins": 0.455229789018631, "rewards/rejected": -1.336111307144165, "step": 326 }, { "epoch": 0.4943310657596372, "epsilon_dpo/beta": 0.019487621262669563, "epsilon_dpo/beta_margin_grad_mean": -0.37633711099624634, "epsilon_dpo/beta_margin_grad_std": 0.19073835015296936, "epsilon_dpo/beta_margin_mean": 0.6068073511123657, "epsilon_dpo/beta_margin_std": 0.9576746821403503, "epsilon_dpo/loss_margin_mean": 31.473876953125, "grad_norm": 49.7882080078125, "kl/avg_steps": 0.375, "kl/beta": 0.019559018313884735, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.99906765620341e-07, "logits/chosen": -3.4143319129943848, "logits/rejected": -3.384676933288574, "logps/chosen": -99.65130615234375, "logps/ref_chosen": -64.87962341308594, "logps/ref_rejected": -68.50736236572266, "logps/rejected": -134.7529296875, "loss": 1.0632, "rewards/accuracies": 0.75, "rewards/chosen": -0.6796836853027344, "rewards/margins": 0.6068073511123657, "rewards/rejected": -1.2864910364151, "step": 327 }, { "epoch": 0.4958427815570673, "epsilon_dpo/beta": 0.019369108602404594, "epsilon_dpo/beta_margin_grad_mean": -0.3478046655654907, "epsilon_dpo/beta_margin_grad_std": 0.16635644435882568, "epsilon_dpo/beta_margin_mean": 0.7288675308227539, "epsilon_dpo/beta_margin_std": 0.8379784822463989, "epsilon_dpo/loss_margin_mean": 37.84938430786133, "grad_norm": 44.434085845947266, "kl/avg_steps": 0.609375, "kl/beta": 0.019485944882035255, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.9861046822486766e-07, "logits/chosen": -3.3878231048583984, "logits/rejected": -3.4686760902404785, "logps/chosen": -84.3826904296875, "logps/ref_chosen": -55.73626708984375, "logps/ref_rejected": -81.68157958984375, "logps/rejected": -148.17738342285156, "loss": 0.9317, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5562355518341064, "rewards/margins": 0.7288675308227539, "rewards/rejected": -1.2851030826568604, "step": 328 }, { "epoch": 0.4973544973544973, "epsilon_dpo/beta": 0.01926090382039547, "epsilon_dpo/beta_margin_grad_mean": -0.33937472105026245, "epsilon_dpo/beta_margin_grad_std": 0.18434439599514008, "epsilon_dpo/beta_margin_mean": 0.8138880133628845, "epsilon_dpo/beta_margin_std": 0.9744046330451965, "epsilon_dpo/loss_margin_mean": 42.51982116699219, "grad_norm": 46.932533264160156, "kl/avg_steps": 0.5625, "kl/beta": 0.019367922097444534, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.9731281109010253e-07, "logits/chosen": -3.413111686706543, "logits/rejected": -3.4522817134857178, "logps/chosen": -86.95401763916016, "logps/ref_chosen": -54.290321350097656, "logps/ref_rejected": -79.90845489501953, "logps/rejected": -155.09197998046875, "loss": 0.9211, "rewards/accuracies": 0.828125, "rewards/chosen": -0.632223904132843, "rewards/margins": 0.8138880133628845, "rewards/rejected": -1.4461119174957275, "step": 329 }, { "epoch": 0.4988662131519274, "epsilon_dpo/beta": 0.019159186631441116, "epsilon_dpo/beta_margin_grad_mean": -0.3496563136577606, "epsilon_dpo/beta_margin_grad_std": 0.18265803158283234, "epsilon_dpo/beta_margin_mean": 0.7166499495506287, "epsilon_dpo/beta_margin_std": 0.891144871711731, "epsilon_dpo/loss_margin_mean": 37.68965530395508, "grad_norm": 44.891422271728516, "kl/avg_steps": 0.53125, "kl/beta": 0.019259586930274963, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.9601383051430505e-07, "logits/chosen": -3.3513903617858887, "logits/rejected": -3.327526092529297, "logps/chosen": -75.82537078857422, "logps/ref_chosen": -49.915130615234375, "logps/ref_rejected": -70.81008911132812, "logps/rejected": -134.40997314453125, "loss": 0.9651, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4991196095943451, "rewards/margins": 0.7166499495506287, "rewards/rejected": -1.2157695293426514, "step": 330 }, { "epoch": 0.5003779289493575, "epsilon_dpo/beta": 0.019033990800380707, "epsilon_dpo/beta_margin_grad_mean": -0.3187009394168854, "epsilon_dpo/beta_margin_grad_std": 0.1709832102060318, "epsilon_dpo/beta_margin_mean": 0.8697278499603271, "epsilon_dpo/beta_margin_std": 0.8706803321838379, "epsilon_dpo/loss_margin_mean": 45.92387008666992, "grad_norm": 36.595516204833984, "kl/avg_steps": 0.65625, "kl/beta": 0.019157810136675835, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.947135628327544e-07, "logits/chosen": -3.4218192100524902, "logits/rejected": -3.333296060562134, "logps/chosen": -81.89517211914062, "logps/ref_chosen": -51.67559051513672, "logps/ref_rejected": -66.58423614501953, "logps/rejected": -142.72769165039062, "loss": 0.8536, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5780783891677856, "rewards/margins": 0.8697278499603271, "rewards/rejected": -1.4478061199188232, "step": 331 }, { "epoch": 0.5018896447467877, "epsilon_dpo/beta": 0.018915843218564987, "epsilon_dpo/beta_margin_grad_mean": -0.35570162534713745, "epsilon_dpo/beta_margin_grad_std": 0.1683649867773056, "epsilon_dpo/beta_margin_mean": 0.6919738054275513, "epsilon_dpo/beta_margin_std": 0.8416243195533752, "epsilon_dpo/loss_margin_mean": 36.79828643798828, "grad_norm": 44.81597137451172, "kl/avg_steps": 0.625, "kl/beta": 0.019032906740903854, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.934120444167326e-07, "logits/chosen": -3.39770245552063, "logits/rejected": -3.309950828552246, "logps/chosen": -94.74606323242188, "logps/ref_chosen": -58.60978698730469, "logps/ref_rejected": -69.01592254638672, "logps/rejected": -141.9504852294922, "loss": 0.9596, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6853818893432617, "rewards/margins": 0.6919738054275513, "rewards/rejected": -1.3773558139801025, "step": 332 }, { "epoch": 0.5034013605442177, "epsilon_dpo/beta": 0.018792441114783287, "epsilon_dpo/beta_margin_grad_mean": -0.3229638636112213, "epsilon_dpo/beta_margin_grad_std": 0.1700924187898636, "epsilon_dpo/beta_margin_mean": 0.8560915589332581, "epsilon_dpo/beta_margin_std": 0.8561777472496033, "epsilon_dpo/loss_margin_mean": 45.77984619140625, "grad_norm": 37.537906646728516, "kl/avg_steps": 0.65625, "kl/beta": 0.018914690241217613, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.921093116725076e-07, "logits/chosen": -3.3646843433380127, "logits/rejected": -3.437655448913574, "logps/chosen": -83.70223236083984, "logps/ref_chosen": -53.214141845703125, "logps/ref_rejected": -79.26551818847656, "logps/rejected": -155.533447265625, "loss": 0.8568, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5749765038490295, "rewards/margins": 0.8560914993286133, "rewards/rejected": -1.4310680627822876, "step": 333 }, { "epoch": 0.5049130763416477, "epsilon_dpo/beta": 0.018693411722779274, "epsilon_dpo/beta_margin_grad_mean": -0.3581380248069763, "epsilon_dpo/beta_margin_grad_std": 0.18094590306282043, "epsilon_dpo/beta_margin_mean": 0.6872604489326477, "epsilon_dpo/beta_margin_std": 0.8981221318244934, "epsilon_dpo/loss_margin_mean": 37.035884857177734, "grad_norm": 42.99663162231445, "kl/avg_steps": 0.53125, "kl/beta": 0.01879137195646763, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.9080540104031484e-07, "logits/chosen": -3.393187999725342, "logits/rejected": -3.443711519241333, "logps/chosen": -88.67373657226562, "logps/ref_chosen": -59.616580963134766, "logps/ref_rejected": -86.77583312988281, "logps/rejected": -152.86886596679688, "loss": 0.9846, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5446189641952515, "rewards/margins": 0.6872604489326477, "rewards/rejected": -1.231879472732544, "step": 334 }, { "epoch": 0.5064247921390779, "epsilon_dpo/beta": 0.018588785082101822, "epsilon_dpo/beta_margin_grad_mean": -0.3731989562511444, "epsilon_dpo/beta_margin_grad_std": 0.17236380279064178, "epsilon_dpo/beta_margin_mean": 0.5979926586151123, "epsilon_dpo/beta_margin_std": 0.8442245721817017, "epsilon_dpo/loss_margin_mean": 32.423709869384766, "grad_norm": 55.91836929321289, "kl/avg_steps": 0.5625, "kl/beta": 0.018692070618271828, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.895003489933375e-07, "logits/chosen": -3.292750358581543, "logits/rejected": -3.3110814094543457, "logps/chosen": -89.97306823730469, "logps/ref_chosen": -54.95771026611328, "logps/ref_rejected": -70.10445404052734, "logps/rejected": -137.54351806640625, "loss": 1.0297, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6535066962242126, "rewards/margins": 0.5979926586151123, "rewards/rejected": -1.2514992952346802, "step": 335 }, { "epoch": 0.5079365079365079, "epsilon_dpo/beta": 0.018473191186785698, "epsilon_dpo/beta_margin_grad_mean": -0.34337422251701355, "epsilon_dpo/beta_margin_grad_std": 0.17747966945171356, "epsilon_dpo/beta_margin_mean": 0.7704724073410034, "epsilon_dpo/beta_margin_std": 0.9321368932723999, "epsilon_dpo/loss_margin_mean": 41.969566345214844, "grad_norm": 48.003360748291016, "kl/avg_steps": 0.625, "kl/beta": 0.018587516620755196, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.8819419203668675e-07, "logits/chosen": -3.3661270141601562, "logits/rejected": -3.420741319656372, "logps/chosen": -96.89640808105469, "logps/ref_chosen": -59.799957275390625, "logps/ref_rejected": -88.13325500488281, "logps/rejected": -167.1992645263672, "loss": 0.9328, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6873288154602051, "rewards/margins": 0.7704724073410034, "rewards/rejected": -1.4578012228012085, "step": 336 }, { "epoch": 0.509448223733938, "epsilon_dpo/beta": 0.01841040886938572, "epsilon_dpo/beta_margin_grad_mean": -0.39326056838035583, "epsilon_dpo/beta_margin_grad_std": 0.17929592728614807, "epsilon_dpo/beta_margin_mean": 0.5401861667633057, "epsilon_dpo/beta_margin_std": 0.9443849325180054, "epsilon_dpo/loss_margin_mean": 29.665878295898438, "grad_norm": 52.048397064208984, "kl/avg_steps": 0.34375, "kl/beta": 0.018472066149115562, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.8688696670638053e-07, "logits/chosen": -3.374258279800415, "logits/rejected": -3.417283773422241, "logps/chosen": -102.15555572509766, "logps/ref_chosen": -61.91872787475586, "logps/ref_rejected": -84.14926147460938, "logps/rejected": -154.05197143554688, "loss": 1.0976, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7447824478149414, "rewards/margins": 0.5401861667633057, "rewards/rejected": -1.284968614578247, "step": 337 }, { "epoch": 0.5109599395313681, "epsilon_dpo/beta": 0.01832432486116886, "epsilon_dpo/beta_margin_grad_mean": -0.3753017485141754, "epsilon_dpo/beta_margin_grad_std": 0.17814604938030243, "epsilon_dpo/beta_margin_mean": 0.5944863557815552, "epsilon_dpo/beta_margin_std": 0.8636569380760193, "epsilon_dpo/loss_margin_mean": 32.74312210083008, "grad_norm": 53.6877326965332, "kl/avg_steps": 0.46875, "kl/beta": 0.01840878464281559, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -3.359755277633667, "logits/rejected": -3.355471611022949, "logps/chosen": -95.95315551757812, "logps/ref_chosen": -55.74195098876953, "logps/ref_rejected": -71.55216979980469, "logps/rejected": -144.50650024414062, "loss": 1.0402, "rewards/accuracies": 0.75, "rewards/chosen": -0.7390503883361816, "rewards/margins": 0.5944863557815552, "rewards/rejected": -1.3335367441177368, "step": 338 }, { "epoch": 0.5124716553287982, "epsilon_dpo/beta": 0.018215926364064217, "epsilon_dpo/beta_margin_grad_mean": -0.363910973072052, "epsilon_dpo/beta_margin_grad_std": 0.17328131198883057, "epsilon_dpo/beta_margin_mean": 0.6451756358146667, "epsilon_dpo/beta_margin_std": 0.8335817456245422, "epsilon_dpo/loss_margin_mean": 35.67417526245117, "grad_norm": 53.465782165527344, "kl/avg_steps": 0.59375, "kl/beta": 0.0183228962123394, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.842694572172736e-07, "logits/chosen": -3.225489616394043, "logits/rejected": -3.3362607955932617, "logps/chosen": -68.69784545898438, "logps/ref_chosen": -42.442649841308594, "logps/ref_rejected": -65.46226501464844, "logps/rejected": -127.39163208007812, "loss": 0.9933, "rewards/accuracies": 0.796875, "rewards/chosen": -0.48113203048706055, "rewards/margins": 0.6451756358146667, "rewards/rejected": -1.126307725906372, "step": 339 }, { "epoch": 0.5139833711262283, "epsilon_dpo/beta": 0.018131179735064507, "epsilon_dpo/beta_margin_grad_mean": -0.3626411259174347, "epsilon_dpo/beta_margin_grad_std": 0.17720721662044525, "epsilon_dpo/beta_margin_mean": 0.6630287766456604, "epsilon_dpo/beta_margin_std": 0.8619813919067383, "epsilon_dpo/loss_margin_mean": 36.85167694091797, "grad_norm": 36.84088897705078, "kl/avg_steps": 0.46875, "kl/beta": 0.018214747309684753, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.8295924627584004e-07, "logits/chosen": -3.2829673290252686, "logits/rejected": -3.198265314102173, "logps/chosen": -78.22593688964844, "logps/ref_chosen": -46.91853713989258, "logps/ref_rejected": -55.01128387451172, "logps/rejected": -123.17034912109375, "loss": 0.9903, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5706087350845337, "rewards/margins": 0.6630287766456604, "rewards/rejected": -1.2336375713348389, "step": 340 }, { "epoch": 0.5154950869236583, "epsilon_dpo/beta": 0.018046583980321884, "epsilon_dpo/beta_margin_grad_mean": -0.37172645330429077, "epsilon_dpo/beta_margin_grad_std": 0.17616744339466095, "epsilon_dpo/beta_margin_mean": 0.6090283393859863, "epsilon_dpo/beta_margin_std": 0.8536717891693115, "epsilon_dpo/loss_margin_mean": 34.04542922973633, "grad_norm": 40.59037399291992, "kl/avg_steps": 0.46875, "kl/beta": 0.01812976412475109, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.816481133934373e-07, "logits/chosen": -3.3924736976623535, "logits/rejected": -3.470890998840332, "logps/chosen": -88.79869079589844, "logps/ref_chosen": -56.489498138427734, "logps/ref_rejected": -76.42179870605469, "logps/rejected": -142.7764129638672, "loss": 1.0259, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5859235525131226, "rewards/margins": 0.6090283393859863, "rewards/rejected": -1.1949518918991089, "step": 341 }, { "epoch": 0.5170068027210885, "epsilon_dpo/beta": 0.017979303374886513, "epsilon_dpo/beta_margin_grad_mean": -0.3659070134162903, "epsilon_dpo/beta_margin_grad_std": 0.17422275245189667, "epsilon_dpo/beta_margin_mean": 0.6480848789215088, "epsilon_dpo/beta_margin_std": 0.8421779870986938, "epsilon_dpo/loss_margin_mean": 36.36375427246094, "grad_norm": 41.37820053100586, "kl/avg_steps": 0.375, "kl/beta": 0.018045175820589066, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.8033609524527046e-07, "logits/chosen": -3.253039836883545, "logits/rejected": -3.3439550399780273, "logps/chosen": -73.17562103271484, "logps/ref_chosen": -44.2977294921875, "logps/ref_rejected": -62.873634338378906, "logps/rejected": -128.1152801513672, "loss": 0.9934, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5215475559234619, "rewards/margins": 0.6480848789215088, "rewards/rejected": -1.1696324348449707, "step": 342 }, { "epoch": 0.5185185185185185, "epsilon_dpo/beta": 0.017889659851789474, "epsilon_dpo/beta_margin_grad_mean": -0.37413105368614197, "epsilon_dpo/beta_margin_grad_std": 0.14626076817512512, "epsilon_dpo/beta_margin_mean": 0.5817154049873352, "epsilon_dpo/beta_margin_std": 0.7043190002441406, "epsilon_dpo/loss_margin_mean": 32.736324310302734, "grad_norm": 36.98164367675781, "kl/avg_steps": 0.5, "kl/beta": 0.0179777592420578, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.7902322853130753e-07, "logits/chosen": -3.5400826930999756, "logits/rejected": -3.489339590072632, "logps/chosen": -86.29547119140625, "logps/ref_chosen": -61.75351333618164, "logps/ref_rejected": -79.50763702392578, "logps/rejected": -136.78591918945312, "loss": 0.9948, "rewards/accuracies": 0.75, "rewards/chosen": -0.4404725432395935, "rewards/margins": 0.5817153453826904, "rewards/rejected": -1.0221879482269287, "step": 343 }, { "epoch": 0.5200302343159486, "epsilon_dpo/beta": 0.017789475619792938, "epsilon_dpo/beta_margin_grad_mean": -0.3666778802871704, "epsilon_dpo/beta_margin_grad_std": 0.15356296300888062, "epsilon_dpo/beta_margin_mean": 0.6295640468597412, "epsilon_dpo/beta_margin_std": 0.7700322270393372, "epsilon_dpo/loss_margin_mean": 35.61920928955078, "grad_norm": 45.933475494384766, "kl/avg_steps": 0.5625, "kl/beta": 0.017888318747282028, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.7770954997525274e-07, "logits/chosen": -3.3116679191589355, "logits/rejected": -3.406489849090576, "logps/chosen": -81.83769226074219, "logps/ref_chosen": -50.44179916381836, "logps/ref_rejected": -74.44220733642578, "logps/rejected": -141.45730590820312, "loss": 0.9768, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5603649616241455, "rewards/margins": 0.6295640468597412, "rewards/rejected": -1.1899290084838867, "step": 344 }, { "epoch": 0.5215419501133787, "epsilon_dpo/beta": 0.017689969390630722, "epsilon_dpo/beta_margin_grad_mean": -0.37110435962677, "epsilon_dpo/beta_margin_grad_std": 0.16225150227546692, "epsilon_dpo/beta_margin_mean": 0.608718991279602, "epsilon_dpo/beta_margin_std": 0.7831045985221863, "epsilon_dpo/loss_margin_mean": 34.64168167114258, "grad_norm": 40.886009216308594, "kl/avg_steps": 0.5625, "kl/beta": 0.01778825931251049, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.7639509632351927e-07, "logits/chosen": -3.4382143020629883, "logits/rejected": -3.5035645961761475, "logps/chosen": -69.18930053710938, "logps/ref_chosen": -50.721946716308594, "logps/ref_rejected": -74.75621032714844, "logps/rejected": -127.86524963378906, "loss": 1.0008, "rewards/accuracies": 0.75, "rewards/chosen": -0.3282455801963806, "rewards/margins": 0.608718991279602, "rewards/rejected": -0.9369645714759827, "step": 345 }, { "epoch": 0.5230536659108088, "epsilon_dpo/beta": 0.017596548423171043, "epsilon_dpo/beta_margin_grad_mean": -0.37089794874191284, "epsilon_dpo/beta_margin_grad_std": 0.15857015550136566, "epsilon_dpo/beta_margin_mean": 0.6004424691200256, "epsilon_dpo/beta_margin_std": 0.7595969438552856, "epsilon_dpo/loss_margin_mean": 34.37065505981445, "grad_norm": 41.574195861816406, "kl/avg_steps": 0.53125, "kl/beta": 0.01768876053392887, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.7507990434420123e-07, "logits/chosen": -3.482679843902588, "logits/rejected": -3.4848432540893555, "logps/chosen": -75.52957153320312, "logps/ref_chosen": -52.331573486328125, "logps/ref_rejected": -79.70005798339844, "logps/rejected": -137.26870727539062, "loss": 0.9997, "rewards/accuracies": 0.765625, "rewards/chosen": -0.40921342372894287, "rewards/margins": 0.6004424095153809, "rewards/rejected": -1.0096559524536133, "step": 346 }, { "epoch": 0.5245653817082389, "epsilon_dpo/beta": 0.017520057037472725, "epsilon_dpo/beta_margin_grad_mean": -0.3717530369758606, "epsilon_dpo/beta_margin_grad_std": 0.1667712926864624, "epsilon_dpo/beta_margin_mean": 0.6061415076255798, "epsilon_dpo/beta_margin_std": 0.8208181262016296, "epsilon_dpo/loss_margin_mean": 34.883121490478516, "grad_norm": 39.43595886230469, "kl/avg_steps": 0.4375, "kl/beta": 0.017595285549759865, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.737640108260456e-07, "logits/chosen": -3.3054065704345703, "logits/rejected": -3.3840725421905518, "logps/chosen": -75.33184814453125, "logps/ref_chosen": -50.49886703491211, "logps/ref_rejected": -66.99897766113281, "logps/rejected": -126.71507263183594, "loss": 1.0147, "rewards/accuracies": 0.75, "rewards/chosen": -0.4367862939834595, "rewards/margins": 0.6061415076255798, "rewards/rejected": -1.042927861213684, "step": 347 }, { "epoch": 0.5260770975056689, "epsilon_dpo/beta": 0.017432792112231255, "epsilon_dpo/beta_margin_grad_mean": -0.3848005533218384, "epsilon_dpo/beta_margin_grad_std": 0.15636169910430908, "epsilon_dpo/beta_margin_mean": 0.5382253527641296, "epsilon_dpo/beta_margin_std": 0.763007640838623, "epsilon_dpo/loss_margin_mean": 31.119930267333984, "grad_norm": 39.39665985107422, "kl/avg_steps": 0.5, "kl/beta": 0.01751864142715931, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.724474525774229e-07, "logits/chosen": -3.345548391342163, "logits/rejected": -3.387348175048828, "logps/chosen": -71.2667236328125, "logps/ref_chosen": -46.789093017578125, "logps/ref_rejected": -62.48242950439453, "logps/rejected": -118.07998657226562, "loss": 1.0444, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4291536211967468, "rewards/margins": 0.5382253527641296, "rewards/rejected": -0.9673789739608765, "step": 348 }, { "epoch": 0.527588813303099, "epsilon_dpo/beta": 0.017346061766147614, "epsilon_dpo/beta_margin_grad_mean": -0.3677341938018799, "epsilon_dpo/beta_margin_grad_std": 0.1533939093351364, "epsilon_dpo/beta_margin_mean": 0.6218425035476685, "epsilon_dpo/beta_margin_std": 0.7507154941558838, "epsilon_dpo/loss_margin_mean": 36.09321212768555, "grad_norm": 38.02052307128906, "kl/avg_steps": 0.5, "kl/beta": 0.017431484535336494, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.711302664252973e-07, "logits/chosen": -3.3447628021240234, "logits/rejected": -3.3754830360412598, "logps/chosen": -70.95066833496094, "logps/ref_chosen": -48.089927673339844, "logps/ref_rejected": -74.52406311035156, "logps/rejected": -133.47802734375, "loss": 0.9792, "rewards/accuracies": 0.765625, "rewards/chosen": -0.39777398109436035, "rewards/margins": 0.6218425035476685, "rewards/rejected": -1.0196166038513184, "step": 349 }, { "epoch": 0.5291005291005291, "epsilon_dpo/beta": 0.01724349893629551, "epsilon_dpo/beta_margin_grad_mean": -0.34851065278053284, "epsilon_dpo/beta_margin_grad_std": 0.15505020320415497, "epsilon_dpo/beta_margin_mean": 0.6934606432914734, "epsilon_dpo/beta_margin_std": 0.7264665365219116, "epsilon_dpo/loss_margin_mean": 40.45161437988281, "grad_norm": 35.618629455566406, "kl/avg_steps": 0.59375, "kl/beta": 0.0173447597771883, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.698124892141971e-07, "logits/chosen": -3.4050345420837402, "logits/rejected": -3.4117021560668945, "logps/chosen": -73.09703826904297, "logps/ref_chosen": -52.5406379699707, "logps/ref_rejected": -74.3304443359375, "logps/rejected": -135.3384552001953, "loss": 0.9264, "rewards/accuracies": 0.78125, "rewards/chosen": -0.35566264390945435, "rewards/margins": 0.6934606432914734, "rewards/rejected": -1.0491232872009277, "step": 350 }, { "epoch": 0.5306122448979592, "epsilon_dpo/beta": 0.01715249754488468, "epsilon_dpo/beta_margin_grad_mean": -0.36785462498664856, "epsilon_dpo/beta_margin_grad_std": 0.16931787133216858, "epsilon_dpo/beta_margin_mean": 0.6120951771736145, "epsilon_dpo/beta_margin_std": 0.8174942135810852, "epsilon_dpo/loss_margin_mean": 35.96962356567383, "grad_norm": 42.16701126098633, "kl/avg_steps": 0.53125, "kl/beta": 0.01724238321185112, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6849415780518357e-07, "logits/chosen": -3.406905174255371, "logits/rejected": -3.418269634246826, "logps/chosen": -73.9931869506836, "logps/ref_chosen": -49.928810119628906, "logps/ref_rejected": -72.40730285644531, "logps/rejected": -132.44129943847656, "loss": 1.0117, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4156160354614258, "rewards/margins": 0.6120951771736145, "rewards/rejected": -1.0277111530303955, "step": 351 }, { "epoch": 0.5321239606953893, "epsilon_dpo/beta": 0.017037708312273026, "epsilon_dpo/beta_margin_grad_mean": -0.3512311577796936, "epsilon_dpo/beta_margin_grad_std": 0.13588550686836243, "epsilon_dpo/beta_margin_mean": 0.6884089112281799, "epsilon_dpo/beta_margin_std": 0.7018852829933167, "epsilon_dpo/loss_margin_mean": 40.5786247253418, "grad_norm": 38.8215217590332, "kl/avg_steps": 0.671875, "kl/beta": 0.01715126633644104, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.6717530907482024e-07, "logits/chosen": -3.383419990539551, "logits/rejected": -3.3990321159362793, "logps/chosen": -77.245361328125, "logps/ref_chosen": -56.07526397705078, "logps/ref_rejected": -75.26618194580078, "logps/rejected": -137.01490783691406, "loss": 0.9125, "rewards/accuracies": 0.84375, "rewards/chosen": -0.36138951778411865, "rewards/margins": 0.6884089708328247, "rewards/rejected": -1.0497983694076538, "step": 352 }, { "epoch": 0.5336356764928194, "epsilon_dpo/beta": 0.01694798842072487, "epsilon_dpo/beta_margin_grad_mean": -0.3664696514606476, "epsilon_dpo/beta_margin_grad_std": 0.1521570086479187, "epsilon_dpo/beta_margin_mean": 0.6074159145355225, "epsilon_dpo/beta_margin_std": 0.7230923175811768, "epsilon_dpo/loss_margin_mean": 36.08266830444336, "grad_norm": 31.017539978027344, "kl/avg_steps": 0.53125, "kl/beta": 0.01703680120408535, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.658559799141411e-07, "logits/chosen": -3.4448561668395996, "logits/rejected": -3.35012149810791, "logps/chosen": -84.41938781738281, "logps/ref_chosen": -60.5035400390625, "logps/ref_rejected": -65.11581420898438, "logps/rejected": -125.11433410644531, "loss": 0.9843, "rewards/accuracies": 0.796875, "rewards/chosen": -0.40665316581726074, "rewards/margins": 0.6074159145355225, "rewards/rejected": -1.0140690803527832, "step": 353 }, { "epoch": 0.5351473922902494, "epsilon_dpo/beta": 0.016858428716659546, "epsilon_dpo/beta_margin_grad_mean": -0.36161530017852783, "epsilon_dpo/beta_margin_grad_std": 0.16449138522148132, "epsilon_dpo/beta_margin_mean": 0.662818968296051, "epsilon_dpo/beta_margin_std": 0.8085375428199768, "epsilon_dpo/loss_margin_mean": 39.59104919433594, "grad_norm": 38.932437896728516, "kl/avg_steps": 0.53125, "kl/beta": 0.01694677211344242, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6453620722761895e-07, "logits/chosen": -3.1711440086364746, "logits/rejected": -3.358678102493286, "logps/chosen": -57.44135284423828, "logps/ref_chosen": -35.433738708496094, "logps/ref_rejected": -62.08784103393555, "logps/rejected": -123.6865005493164, "loss": 0.9691, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3732542395591736, "rewards/margins": 0.662818968296051, "rewards/rejected": -1.0360732078552246, "step": 354 }, { "epoch": 0.5366591080876795, "epsilon_dpo/beta": 0.016769342124462128, "epsilon_dpo/beta_margin_grad_mean": -0.3652481436729431, "epsilon_dpo/beta_margin_grad_std": 0.1818566769361496, "epsilon_dpo/beta_margin_mean": 0.6409850716590881, "epsilon_dpo/beta_margin_std": 0.8723416924476624, "epsilon_dpo/loss_margin_mean": 38.549095153808594, "grad_norm": 43.391563415527344, "kl/avg_steps": 0.53125, "kl/beta": 0.016857217997312546, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.632160279321328e-07, "logits/chosen": -3.1993560791015625, "logits/rejected": -3.368563652038574, "logps/chosen": -72.08704376220703, "logps/ref_chosen": -39.2044677734375, "logps/ref_rejected": -73.43476867675781, "logps/rejected": -144.86642456054688, "loss": 1.0114, "rewards/accuracies": 0.75, "rewards/chosen": -0.5541493892669678, "rewards/margins": 0.6409851312637329, "rewards/rejected": -1.1951344013214111, "step": 355 }, { "epoch": 0.5381708238851096, "epsilon_dpo/beta": 0.016685964539647102, "epsilon_dpo/beta_margin_grad_mean": -0.38575485348701477, "epsilon_dpo/beta_margin_grad_std": 0.18575793504714966, "epsilon_dpo/beta_margin_mean": 0.5375723242759705, "epsilon_dpo/beta_margin_std": 0.8933961391448975, "epsilon_dpo/loss_margin_mean": 32.55303192138672, "grad_norm": 38.00511169433594, "kl/avg_steps": 0.5, "kl/beta": 0.016768136993050575, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.618954789559356e-07, "logits/chosen": -3.318031072616577, "logits/rejected": -3.465146780014038, "logps/chosen": -70.73081970214844, "logps/ref_chosen": -42.615623474121094, "logps/ref_rejected": -71.92729187011719, "logps/rejected": -132.59552001953125, "loss": 1.0956, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4721330404281616, "rewards/margins": 0.5375723242759705, "rewards/rejected": -1.0097054243087769, "step": 356 }, { "epoch": 0.5396825396825397, "epsilon_dpo/beta": 0.01658209227025509, "epsilon_dpo/beta_margin_grad_mean": -0.3596562147140503, "epsilon_dpo/beta_margin_grad_std": 0.1463175117969513, "epsilon_dpo/beta_margin_mean": 0.6394835114479065, "epsilon_dpo/beta_margin_std": 0.6846140623092651, "epsilon_dpo/loss_margin_mean": 38.78138732910156, "grad_norm": 40.2320671081543, "kl/avg_steps": 0.625, "kl/beta": 0.01668471284210682, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.6057459723762076e-07, "logits/chosen": -3.319241523742676, "logits/rejected": -3.334078311920166, "logps/chosen": -79.93377685546875, "logps/ref_chosen": -52.681365966796875, "logps/ref_rejected": -59.05322265625, "logps/rejected": -125.08702087402344, "loss": 0.9501, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45321381092071533, "rewards/margins": 0.6394835710525513, "rewards/rejected": -1.0926973819732666, "step": 357 }, { "epoch": 0.5411942554799698, "epsilon_dpo/beta": 0.016489461064338684, "epsilon_dpo/beta_margin_grad_mean": -0.34337183833122253, "epsilon_dpo/beta_margin_grad_std": 0.17295251786708832, "epsilon_dpo/beta_margin_mean": 0.7428022623062134, "epsilon_dpo/beta_margin_std": 0.8545412421226501, "epsilon_dpo/loss_margin_mean": 45.35118865966797, "grad_norm": 41.2762336730957, "kl/avg_steps": 0.5625, "kl/beta": 0.016581080853939056, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5925341972508954e-07, "logits/chosen": -3.320063591003418, "logits/rejected": -3.323241710662842, "logps/chosen": -87.45002746582031, "logps/ref_chosen": -57.0673942565918, "logps/ref_rejected": -64.46846008300781, "logps/rejected": -140.20228576660156, "loss": 0.9324, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5021802186965942, "rewards/margins": 0.7428022623062134, "rewards/rejected": -1.2449824810028076, "step": 358 }, { "epoch": 0.5427059712773998, "epsilon_dpo/beta": 0.01640753261744976, "epsilon_dpo/beta_margin_grad_mean": -0.37836432456970215, "epsilon_dpo/beta_margin_grad_std": 0.1539464145898819, "epsilon_dpo/beta_margin_mean": 0.5634649991989136, "epsilon_dpo/beta_margin_std": 0.7400045394897461, "epsilon_dpo/loss_margin_mean": 34.591773986816406, "grad_norm": 41.34294509887695, "kl/avg_steps": 0.5, "kl/beta": 0.0164883341640234, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.579319833745169e-07, "logits/chosen": -3.4052157402038574, "logits/rejected": -3.3466391563415527, "logps/chosen": -96.06185150146484, "logps/ref_chosen": -58.52412414550781, "logps/ref_rejected": -72.77745056152344, "logps/rejected": -144.90695190429688, "loss": 1.0194, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6189997792243958, "rewards/margins": 0.5634649991989136, "rewards/rejected": -1.182464838027954, "step": 359 }, { "epoch": 0.54421768707483, "epsilon_dpo/beta": 0.016310520470142365, "epsilon_dpo/beta_margin_grad_mean": -0.3558012545108795, "epsilon_dpo/beta_margin_grad_std": 0.15470626950263977, "epsilon_dpo/beta_margin_mean": 0.6934565305709839, "epsilon_dpo/beta_margin_std": 0.8104661107063293, "epsilon_dpo/loss_margin_mean": 42.739261627197266, "grad_norm": 44.59520721435547, "kl/avg_steps": 0.59375, "kl/beta": 0.016406303271651268, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.5661032514931834e-07, "logits/chosen": -3.3282651901245117, "logits/rejected": -3.4520766735076904, "logps/chosen": -89.13972473144531, "logps/ref_chosen": -51.77776336669922, "logps/ref_rejected": -83.45474243164062, "logps/rejected": -163.55596923828125, "loss": 0.9414, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6103986501693726, "rewards/margins": 0.6934565305709839, "rewards/rejected": -1.3038551807403564, "step": 360 }, { "epoch": 0.54572940287226, "epsilon_dpo/beta": 0.016234638169407845, "epsilon_dpo/beta_margin_grad_mean": -0.3618767261505127, "epsilon_dpo/beta_margin_grad_std": 0.16919633746147156, "epsilon_dpo/beta_margin_mean": 0.658193826675415, "epsilon_dpo/beta_margin_std": 0.8122826814651489, "epsilon_dpo/loss_margin_mean": 40.85041046142578, "grad_norm": 41.18626022338867, "kl/avg_steps": 0.46875, "kl/beta": 0.0163094662129879, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.552884820191154e-07, "logits/chosen": -3.289153814315796, "logits/rejected": -3.316497325897217, "logps/chosen": -83.62090301513672, "logps/ref_chosen": -47.967262268066406, "logps/ref_rejected": -66.61077117919922, "logps/rejected": -143.1148223876953, "loss": 0.9764, "rewards/accuracies": 0.75, "rewards/chosen": -0.581680178642273, "rewards/margins": 0.658193826675415, "rewards/rejected": -1.239874005317688, "step": 361 }, { "epoch": 0.54724111866969, "epsilon_dpo/beta": 0.01616140455007553, "epsilon_dpo/beta_margin_grad_mean": -0.3655538558959961, "epsilon_dpo/beta_margin_grad_std": 0.17590853571891785, "epsilon_dpo/beta_margin_mean": 0.6540786623954773, "epsilon_dpo/beta_margin_std": 0.8872355222702026, "epsilon_dpo/loss_margin_mean": 40.80686569213867, "grad_norm": 42.047733306884766, "kl/avg_steps": 0.453125, "kl/beta": 0.016233371570706367, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.53966490958702e-07, "logits/chosen": -3.381411075592041, "logits/rejected": -3.5107688903808594, "logps/chosen": -101.22875213623047, "logps/ref_chosen": -57.52854919433594, "logps/ref_rejected": -93.94340515136719, "logps/rejected": -178.45046997070312, "loss": 1.0003, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7079542279243469, "rewards/margins": 0.6540787220001221, "rewards/rejected": -1.3620328903198242, "step": 362 }, { "epoch": 0.5487528344671202, "epsilon_dpo/beta": 0.01605064980685711, "epsilon_dpo/beta_margin_grad_mean": -0.35354748368263245, "epsilon_dpo/beta_margin_grad_std": 0.17584973573684692, "epsilon_dpo/beta_margin_mean": 0.7374035120010376, "epsilon_dpo/beta_margin_std": 0.9622324109077454, "epsilon_dpo/loss_margin_mean": 46.187538146972656, "grad_norm": 48.846832275390625, "kl/avg_steps": 0.6875, "kl/beta": 0.01616014540195465, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.526443889470099e-07, "logits/chosen": -3.206235408782959, "logits/rejected": -3.4505252838134766, "logps/chosen": -85.89228057861328, "logps/ref_chosen": -44.890724182128906, "logps/ref_rejected": -84.15472412109375, "logps/rejected": -171.34381103515625, "loss": 0.9603, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6592026948928833, "rewards/margins": 0.7374035120010376, "rewards/rejected": -1.396606206893921, "step": 363 }, { "epoch": 0.5502645502645502, "epsilon_dpo/beta": 0.01598620042204857, "epsilon_dpo/beta_margin_grad_mean": -0.34754303097724915, "epsilon_dpo/beta_margin_grad_std": 0.19974981248378754, "epsilon_dpo/beta_margin_mean": 0.799750030040741, "epsilon_dpo/beta_margin_std": 1.0889291763305664, "epsilon_dpo/loss_margin_mean": 50.474464416503906, "grad_norm": 39.8922119140625, "kl/avg_steps": 0.40625, "kl/beta": 0.016049804165959358, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.513222129660744e-07, "logits/chosen": -3.341822624206543, "logits/rejected": -3.316934585571289, "logps/chosen": -92.84584045410156, "logps/ref_chosen": -52.695404052734375, "logps/ref_rejected": -74.79721069335938, "logps/rejected": -165.422119140625, "loss": 0.9697, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6448004245758057, "rewards/margins": 0.7997500896453857, "rewards/rejected": -1.4445505142211914, "step": 364 }, { "epoch": 0.5517762660619804, "epsilon_dpo/beta": 0.015906531363725662, "epsilon_dpo/beta_margin_grad_mean": -0.34737733006477356, "epsilon_dpo/beta_margin_grad_std": 0.15040113031864166, "epsilon_dpo/beta_margin_mean": 0.7503244280815125, "epsilon_dpo/beta_margin_std": 0.8723316788673401, "epsilon_dpo/loss_margin_mean": 47.44552230834961, "grad_norm": 34.89613723754883, "kl/avg_steps": 0.5, "kl/beta": 0.01598486490547657, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.5e-07, "logits/chosen": -3.4144864082336426, "logits/rejected": -3.3406906127929688, "logps/chosen": -93.8440933227539, "logps/ref_chosen": -59.81489944458008, "logps/ref_rejected": -69.19140625, "logps/rejected": -150.66612243652344, "loss": 0.9092, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5412712097167969, "rewards/margins": 0.7503244280815125, "rewards/rejected": -1.291595697402954, "step": 365 }, { "epoch": 0.5532879818594104, "epsilon_dpo/beta": 0.015832362696528435, "epsilon_dpo/beta_margin_grad_mean": -0.36766955256462097, "epsilon_dpo/beta_margin_grad_std": 0.17102445662021637, "epsilon_dpo/beta_margin_mean": 0.6254857778549194, "epsilon_dpo/beta_margin_std": 0.8205941915512085, "epsilon_dpo/loss_margin_mean": 39.83504867553711, "grad_norm": 43.04199981689453, "kl/avg_steps": 0.46875, "kl/beta": 0.015905337408185005, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.486777870339255e-07, "logits/chosen": -3.196176528930664, "logits/rejected": -3.260798931121826, "logps/chosen": -76.98192596435547, "logps/ref_chosen": -46.85981750488281, "logps/ref_rejected": -62.585548400878906, "logps/rejected": -132.54270935058594, "loss": 1.0029, "rewards/accuracies": 0.765625, "rewards/chosen": -0.47811809182167053, "rewards/margins": 0.6254857778549194, "rewards/rejected": -1.1036038398742676, "step": 366 }, { "epoch": 0.5547996976568406, "epsilon_dpo/beta": 0.01572386361658573, "epsilon_dpo/beta_margin_grad_mean": -0.35749372839927673, "epsilon_dpo/beta_margin_grad_std": 0.1758396327495575, "epsilon_dpo/beta_margin_mean": 0.6691994071006775, "epsilon_dpo/beta_margin_std": 0.8513617515563965, "epsilon_dpo/loss_margin_mean": 42.815162658691406, "grad_norm": 47.53231430053711, "kl/avg_steps": 0.6875, "kl/beta": 0.015831129625439644, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.4735561105299014e-07, "logits/chosen": -3.298549175262451, "logits/rejected": -3.3775033950805664, "logps/chosen": -91.46617126464844, "logps/ref_chosen": -51.255069732666016, "logps/ref_rejected": -74.83206176757812, "logps/rejected": -157.8583221435547, "loss": 0.9832, "rewards/accuracies": 0.828125, "rewards/chosen": -0.635130763053894, "rewards/margins": 0.6691993474960327, "rewards/rejected": -1.3043301105499268, "step": 367 }, { "epoch": 0.5563114134542706, "epsilon_dpo/beta": 0.01565581187605858, "epsilon_dpo/beta_margin_grad_mean": -0.37138739228248596, "epsilon_dpo/beta_margin_grad_std": 0.19692209362983704, "epsilon_dpo/beta_margin_mean": 0.6128949522972107, "epsilon_dpo/beta_margin_std": 0.9206162095069885, "epsilon_dpo/loss_margin_mean": 39.58003234863281, "grad_norm": 47.71054458618164, "kl/avg_steps": 0.4375, "kl/beta": 0.015723034739494324, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.46033509041298e-07, "logits/chosen": -3.3739638328552246, "logits/rejected": -3.2874462604522705, "logps/chosen": -111.79261779785156, "logps/ref_chosen": -62.810523986816406, "logps/ref_rejected": -69.01786804199219, "logps/rejected": -157.57998657226562, "loss": 1.0541, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7701478004455566, "rewards/margins": 0.6128950119018555, "rewards/rejected": -1.383042812347412, "step": 368 }, { "epoch": 0.5578231292517006, "epsilon_dpo/beta": 0.015563152730464935, "epsilon_dpo/beta_margin_grad_mean": -0.33949291706085205, "epsilon_dpo/beta_margin_grad_std": 0.17674343287944794, "epsilon_dpo/beta_margin_mean": 0.7951789498329163, "epsilon_dpo/beta_margin_std": 0.9395841360092163, "epsilon_dpo/loss_margin_mean": 51.410552978515625, "grad_norm": 38.65352249145508, "kl/avg_steps": 0.59375, "kl/beta": 0.0156545452773571, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.447115179808846e-07, "logits/chosen": -3.346435546875, "logits/rejected": -3.327749729156494, "logps/chosen": -85.11600494384766, "logps/ref_chosen": -48.53266143798828, "logps/ref_rejected": -72.31483459472656, "logps/rejected": -160.3087158203125, "loss": 0.9192, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5701428055763245, "rewards/margins": 0.7951788902282715, "rewards/rejected": -1.3653216361999512, "step": 369 }, { "epoch": 0.5593348450491308, "epsilon_dpo/beta": 0.01549074612557888, "epsilon_dpo/beta_margin_grad_mean": -0.350219190120697, "epsilon_dpo/beta_margin_grad_std": 0.21133774518966675, "epsilon_dpo/beta_margin_mean": 0.7736452221870422, "epsilon_dpo/beta_margin_std": 1.1258368492126465, "epsilon_dpo/loss_margin_mean": 50.411224365234375, "grad_norm": 42.95795822143555, "kl/avg_steps": 0.46875, "kl/beta": 0.015562145039439201, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.4338967485068164e-07, "logits/chosen": -3.2409136295318604, "logits/rejected": -3.230123519897461, "logps/chosen": -92.06867218017578, "logps/ref_chosen": -49.51808166503906, "logps/ref_rejected": -74.35448455810547, "logps/rejected": -167.31629943847656, "loss": 1.0129, "rewards/accuracies": 0.75, "rewards/chosen": -0.6627376079559326, "rewards/margins": 0.7736452221870422, "rewards/rejected": -1.43638277053833, "step": 370 }, { "epoch": 0.5608465608465608, "epsilon_dpo/beta": 0.015408790670335293, "epsilon_dpo/beta_margin_grad_mean": -0.3487391471862793, "epsilon_dpo/beta_margin_grad_std": 0.20582331717014313, "epsilon_dpo/beta_margin_mean": 0.7824663519859314, "epsilon_dpo/beta_margin_std": 1.0633907318115234, "epsilon_dpo/loss_margin_mean": 51.1941032409668, "grad_norm": 44.11751174926758, "kl/avg_steps": 0.53125, "kl/beta": 0.015489538200199604, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.420680166254831e-07, "logits/chosen": -3.293853282928467, "logits/rejected": -3.2306013107299805, "logps/chosen": -90.35604858398438, "logps/ref_chosen": -48.58738327026367, "logps/ref_rejected": -56.118438720703125, "logps/rejected": -149.08120727539062, "loss": 0.9827, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6457116603851318, "rewards/margins": 0.7824663519859314, "rewards/rejected": -1.428178071975708, "step": 371 }, { "epoch": 0.562358276643991, "epsilon_dpo/beta": 0.015327363274991512, "epsilon_dpo/beta_margin_grad_mean": -0.3812878131866455, "epsilon_dpo/beta_margin_grad_std": 0.2236192375421524, "epsilon_dpo/beta_margin_mean": 0.6308372616767883, "epsilon_dpo/beta_margin_std": 1.1638354063034058, "epsilon_dpo/loss_margin_mean": 41.60068893432617, "grad_norm": 56.27540588378906, "kl/avg_steps": 0.53125, "kl/beta": 0.01540768425911665, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.4074658027491044e-07, "logits/chosen": -3.222090244293213, "logits/rejected": -3.261155605316162, "logps/chosen": -92.28636932373047, "logps/ref_chosen": -46.61650848388672, "logps/ref_rejected": -68.4769287109375, "logps/rejected": -155.74746704101562, "loss": 1.1326, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7028517723083496, "rewards/margins": 0.6308372616767883, "rewards/rejected": -1.3336889743804932, "step": 372 }, { "epoch": 0.563869992441421, "epsilon_dpo/beta": 0.015255946666002274, "epsilon_dpo/beta_margin_grad_mean": -0.34887006878852844, "epsilon_dpo/beta_margin_grad_std": 0.2207898199558258, "epsilon_dpo/beta_margin_mean": 0.7625728249549866, "epsilon_dpo/beta_margin_std": 1.1884722709655762, "epsilon_dpo/loss_margin_mean": 50.49529266357422, "grad_norm": 53.415306091308594, "kl/avg_steps": 0.46875, "kl/beta": 0.015326263383030891, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.394254027623792e-07, "logits/chosen": -3.274142265319824, "logits/rejected": -3.196444511413574, "logps/chosen": -109.62042999267578, "logps/ref_chosen": -56.139671325683594, "logps/ref_rejected": -64.26171875, "logps/rejected": -168.23776245117188, "loss": 1.0511, "rewards/accuracies": 0.75, "rewards/chosen": -0.8199567794799805, "rewards/margins": 0.7625728845596313, "rewards/rejected": -1.5825295448303223, "step": 373 }, { "epoch": 0.5653817082388511, "epsilon_dpo/beta": 0.015160931274294853, "epsilon_dpo/beta_margin_grad_mean": -0.2996435761451721, "epsilon_dpo/beta_margin_grad_std": 0.1779409945011139, "epsilon_dpo/beta_margin_mean": 1.0450704097747803, "epsilon_dpo/beta_margin_std": 1.0212397575378418, "epsilon_dpo/loss_margin_mean": 69.25741577148438, "grad_norm": 49.35390853881836, "kl/avg_steps": 0.625, "kl/beta": 0.01525475736707449, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.381045210440644e-07, "logits/chosen": -3.3751797676086426, "logits/rejected": -3.2126522064208984, "logps/chosen": -109.77655029296875, "logps/ref_chosen": -67.03562927246094, "logps/ref_rejected": -70.12338256835938, "logps/rejected": -182.12173461914062, "loss": 0.7874, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6494814157485962, "rewards/margins": 1.0450704097747803, "rewards/rejected": -1.6945518255233765, "step": 374 }, { "epoch": 0.5668934240362812, "epsilon_dpo/beta": 0.015080977231264114, "epsilon_dpo/beta_margin_grad_mean": -0.3587202727794647, "epsilon_dpo/beta_margin_grad_std": 0.21145598590373993, "epsilon_dpo/beta_margin_mean": 0.7242752313613892, "epsilon_dpo/beta_margin_std": 1.1032686233520508, "epsilon_dpo/loss_margin_mean": 48.46192932128906, "grad_norm": 52.50359344482422, "kl/avg_steps": 0.53125, "kl/beta": 0.015160007402300835, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3678397206786715e-07, "logits/chosen": -3.249166965484619, "logits/rejected": -3.286276340484619, "logps/chosen": -97.0804214477539, "logps/ref_chosen": -51.218833923339844, "logps/ref_rejected": -70.16511535644531, "logps/rejected": -164.48863220214844, "loss": 1.0394, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6939356327056885, "rewards/margins": 0.7242752313613892, "rewards/rejected": -1.4182108640670776, "step": 375 }, { "epoch": 0.5684051398337112, "epsilon_dpo/beta": 0.01499185711145401, "epsilon_dpo/beta_margin_grad_mean": -0.3288250267505646, "epsilon_dpo/beta_margin_grad_std": 0.227212592959404, "epsilon_dpo/beta_margin_mean": 0.9416694641113281, "epsilon_dpo/beta_margin_std": 1.2772338390350342, "epsilon_dpo/loss_margin_mean": 63.293968200683594, "grad_norm": 59.19460678100586, "kl/avg_steps": 0.59375, "kl/beta": 0.015079895034432411, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3546379277238103e-07, "logits/chosen": -3.1048479080200195, "logits/rejected": -3.2268459796905518, "logps/chosen": -91.07433319091797, "logps/ref_chosen": -43.091346740722656, "logps/ref_rejected": -66.357177734375, "logps/rejected": -177.63414001464844, "loss": 0.9627, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7233933210372925, "rewards/margins": 0.9416694641113281, "rewards/rejected": -1.6650629043579102, "step": 376 }, { "epoch": 0.5699168556311414, "epsilon_dpo/beta": 0.014912738464772701, "epsilon_dpo/beta_margin_grad_mean": -0.3717999756336212, "epsilon_dpo/beta_margin_grad_std": 0.20852404832839966, "epsilon_dpo/beta_margin_mean": 0.6492032408714294, "epsilon_dpo/beta_margin_std": 1.0510106086730957, "epsilon_dpo/loss_margin_mean": 43.959938049316406, "grad_norm": 54.119293212890625, "kl/avg_steps": 0.53125, "kl/beta": 0.01499088667333126, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3414402008585886e-07, "logits/chosen": -3.085165500640869, "logits/rejected": -3.1110129356384277, "logps/chosen": -93.7093505859375, "logps/ref_chosen": -39.7353630065918, "logps/ref_rejected": -55.19762420654297, "logps/rejected": -153.1315460205078, "loss": 1.0731, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8083268404006958, "rewards/margins": 0.6492032408714294, "rewards/rejected": -1.4575300216674805, "step": 377 }, { "epoch": 0.5714285714285714, "epsilon_dpo/beta": 0.014829273335635662, "epsilon_dpo/beta_margin_grad_mean": -0.3458455502986908, "epsilon_dpo/beta_margin_grad_std": 0.18496288359165192, "epsilon_dpo/beta_margin_mean": 0.7642508149147034, "epsilon_dpo/beta_margin_std": 0.9978711009025574, "epsilon_dpo/loss_margin_mean": 51.87539291381836, "grad_norm": 51.9669303894043, "kl/avg_steps": 0.5625, "kl/beta": 0.014911668375134468, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.3282469092517977e-07, "logits/chosen": -3.2028841972351074, "logits/rejected": -3.2186942100524902, "logps/chosen": -106.4610366821289, "logps/ref_chosen": -55.603187561035156, "logps/ref_rejected": -62.18587875366211, "logps/rejected": -164.9191131591797, "loss": 0.9626, "rewards/accuracies": 0.828125, "rewards/chosen": -0.756363034248352, "rewards/margins": 0.7642508149147034, "rewards/rejected": -1.5206139087677002, "step": 378 }, { "epoch": 0.5729402872260015, "epsilon_dpo/beta": 0.014741689898073673, "epsilon_dpo/beta_margin_grad_mean": -0.3386710584163666, "epsilon_dpo/beta_margin_grad_std": 0.21029967069625854, "epsilon_dpo/beta_margin_mean": 0.8771803379058838, "epsilon_dpo/beta_margin_std": 1.211257815361023, "epsilon_dpo/loss_margin_mean": 59.915077209472656, "grad_norm": 47.90486526489258, "kl/avg_steps": 0.59375, "kl/beta": 0.014828259125351906, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3150584219481643e-07, "logits/chosen": -3.400219440460205, "logits/rejected": -3.5537238121032715, "logps/chosen": -123.17984008789062, "logps/ref_chosen": -69.81751251220703, "logps/ref_rejected": -101.04313659667969, "logps/rejected": -214.32052612304688, "loss": 0.9659, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7902753353118896, "rewards/margins": 0.8771803379058838, "rewards/rejected": -1.6674556732177734, "step": 379 }, { "epoch": 0.5744520030234316, "epsilon_dpo/beta": 0.014636251144111156, "epsilon_dpo/beta_margin_grad_mean": -0.2978096306324005, "epsilon_dpo/beta_margin_grad_std": 0.1894843727350235, "epsilon_dpo/beta_margin_mean": 1.034064769744873, "epsilon_dpo/beta_margin_std": 1.036618947982788, "epsilon_dpo/loss_margin_mean": 70.95188903808594, "grad_norm": 44.718658447265625, "kl/avg_steps": 0.71875, "kl/beta": 0.014740736223757267, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.3018751078580283e-07, "logits/chosen": -3.3404767513275146, "logits/rejected": -3.1836423873901367, "logps/chosen": -90.62619018554688, "logps/ref_chosen": -55.78861999511719, "logps/ref_rejected": -63.3931884765625, "logps/rejected": -169.18264770507812, "loss": 0.8116, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5108762979507446, "rewards/margins": 1.034064769744873, "rewards/rejected": -1.5449409484863281, "step": 380 }, { "epoch": 0.5759637188208617, "epsilon_dpo/beta": 0.01456382218748331, "epsilon_dpo/beta_margin_grad_mean": -0.39249077439308167, "epsilon_dpo/beta_margin_grad_std": 0.19489768147468567, "epsilon_dpo/beta_margin_mean": 0.5212841033935547, "epsilon_dpo/beta_margin_std": 1.0820554494857788, "epsilon_dpo/loss_margin_mean": 36.2067985534668, "grad_norm": 58.39653396606445, "kl/avg_steps": 0.5, "kl/beta": 0.014635543338954449, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.288697335747027e-07, "logits/chosen": -3.3492016792297363, "logits/rejected": -3.178860664367676, "logps/chosen": -106.00067138671875, "logps/ref_chosen": -53.819801330566406, "logps/ref_rejected": -57.92563247680664, "logps/rejected": -146.31329345703125, "loss": 1.1679, "rewards/accuracies": 0.71875, "rewards/chosen": -0.76385498046875, "rewards/margins": 0.5212841033935547, "rewards/rejected": -1.2851390838623047, "step": 381 }, { "epoch": 0.5774754346182918, "epsilon_dpo/beta": 0.01448681391775608, "epsilon_dpo/beta_margin_grad_mean": -0.3598490059375763, "epsilon_dpo/beta_margin_grad_std": 0.19208425283432007, "epsilon_dpo/beta_margin_mean": 0.6855300664901733, "epsilon_dpo/beta_margin_std": 0.9582545757293701, "epsilon_dpo/loss_margin_mean": 47.72581481933594, "grad_norm": 51.23689270019531, "kl/avg_steps": 0.53125, "kl/beta": 0.014562729746103287, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.2755254742257706e-07, "logits/chosen": -3.3303654193878174, "logits/rejected": -3.3674495220184326, "logps/chosen": -117.71005249023438, "logps/ref_chosen": -57.819026947021484, "logps/ref_rejected": -79.43643188476562, "logps/rejected": -187.05328369140625, "loss": 1.0097, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8687565326690674, "rewards/margins": 0.6855300664901733, "rewards/rejected": -1.5542867183685303, "step": 382 }, { "epoch": 0.5789871504157218, "epsilon_dpo/beta": 0.014410259202122688, "epsilon_dpo/beta_margin_grad_mean": -0.34290367364883423, "epsilon_dpo/beta_margin_grad_std": 0.2112709879875183, "epsilon_dpo/beta_margin_mean": 0.8232312798500061, "epsilon_dpo/beta_margin_std": 1.133060097694397, "epsilon_dpo/loss_margin_mean": 57.59763717651367, "grad_norm": 55.14401626586914, "kl/avg_steps": 0.53125, "kl/beta": 0.01448577456176281, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.2623598917395436e-07, "logits/chosen": -3.4091572761535645, "logits/rejected": -3.2977962493896484, "logps/chosen": -118.38380432128906, "logps/ref_chosen": -68.7459716796875, "logps/ref_rejected": -67.90165710449219, "logps/rejected": -175.13711547851562, "loss": 0.9803, "rewards/accuracies": 0.78125, "rewards/chosen": -0.717979907989502, "rewards/margins": 0.8232312798500061, "rewards/rejected": -1.5412111282348633, "step": 383 }, { "epoch": 0.5804988662131519, "epsilon_dpo/beta": 0.014302586205303669, "epsilon_dpo/beta_margin_grad_mean": -0.31662464141845703, "epsilon_dpo/beta_margin_grad_std": 0.1673099845647812, "epsilon_dpo/beta_margin_mean": 0.8921586871147156, "epsilon_dpo/beta_margin_std": 0.9410348534584045, "epsilon_dpo/loss_margin_mean": 62.635009765625, "grad_norm": 45.56753158569336, "kl/avg_steps": 0.75, "kl/beta": 0.014409225434064865, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.2492009565579875e-07, "logits/chosen": -3.218989372253418, "logits/rejected": -3.2561981678009033, "logps/chosen": -102.83619689941406, "logps/ref_chosen": -54.397972106933594, "logps/ref_rejected": -71.54708862304688, "logps/rejected": -182.62033081054688, "loss": 0.8555, "rewards/accuracies": 0.875, "rewards/chosen": -0.6942504644393921, "rewards/margins": 0.8921587467193604, "rewards/rejected": -1.586409091949463, "step": 384 }, { "epoch": 0.582010582010582, "epsilon_dpo/beta": 0.014205054379999638, "epsilon_dpo/beta_margin_grad_mean": -0.3090650141239166, "epsilon_dpo/beta_margin_grad_std": 0.17058196663856506, "epsilon_dpo/beta_margin_mean": 0.9528416991233826, "epsilon_dpo/beta_margin_std": 0.918404757976532, "epsilon_dpo/loss_margin_mean": 67.38459777832031, "grad_norm": 47.07590866088867, "kl/avg_steps": 0.6875, "kl/beta": 0.014301960356533527, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.2360490367648084e-07, "logits/chosen": -3.2774322032928467, "logits/rejected": -3.313042640686035, "logps/chosen": -108.97322082519531, "logps/ref_chosen": -57.34923553466797, "logps/ref_rejected": -82.41984558105469, "logps/rejected": -201.42843627929688, "loss": 0.8133, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7345112562179565, "rewards/margins": 0.9528417587280273, "rewards/rejected": -1.6873528957366943, "step": 385 }, { "epoch": 0.5835222978080121, "epsilon_dpo/beta": 0.014139137230813503, "epsilon_dpo/beta_margin_grad_mean": -0.3586812913417816, "epsilon_dpo/beta_margin_grad_std": 0.19485019147396088, "epsilon_dpo/beta_margin_mean": 0.6976020932197571, "epsilon_dpo/beta_margin_std": 0.9871935844421387, "epsilon_dpo/loss_margin_mean": 49.79032516479492, "grad_norm": 58.6000862121582, "kl/avg_steps": 0.46875, "kl/beta": 0.014204305596649647, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.2229045002474724e-07, "logits/chosen": -3.2915077209472656, "logits/rejected": -3.378866195678711, "logps/chosen": -127.89048767089844, "logps/ref_chosen": -62.50819396972656, "logps/ref_rejected": -84.23635864257812, "logps/rejected": -199.40896606445312, "loss": 1.0111, "rewards/accuracies": 0.75, "rewards/chosen": -0.9278383255004883, "rewards/margins": 0.6976020336151123, "rewards/rejected": -1.6254403591156006, "step": 386 }, { "epoch": 0.5850340136054422, "epsilon_dpo/beta": 0.014024564065039158, "epsilon_dpo/beta_margin_grad_mean": -0.2869046628475189, "epsilon_dpo/beta_margin_grad_std": 0.17984089255332947, "epsilon_dpo/beta_margin_mean": 1.0842773914337158, "epsilon_dpo/beta_margin_std": 1.0145516395568848, "epsilon_dpo/loss_margin_mean": 77.56083679199219, "grad_norm": 43.88919448852539, "kl/avg_steps": 0.8125, "kl/beta": 0.014138033613562584, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.209767714686924e-07, "logits/chosen": -3.1605682373046875, "logits/rejected": -3.2737278938293457, "logps/chosen": -94.64328002929688, "logps/ref_chosen": -46.54127502441406, "logps/ref_rejected": -81.99500274658203, "logps/rejected": -207.6578369140625, "loss": 0.7716, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6767662763595581, "rewards/margins": 1.0842773914337158, "rewards/rejected": -1.761043667793274, "step": 387 }, { "epoch": 0.5865457294028723, "epsilon_dpo/beta": 0.01395536307245493, "epsilon_dpo/beta_margin_grad_mean": -0.35848677158355713, "epsilon_dpo/beta_margin_grad_std": 0.1969565451145172, "epsilon_dpo/beta_margin_mean": 0.7408319115638733, "epsilon_dpo/beta_margin_std": 1.0855438709259033, "epsilon_dpo/loss_margin_mean": 53.51836013793945, "grad_norm": 50.701534271240234, "kl/avg_steps": 0.5, "kl/beta": 0.01402408815920353, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.1966390475472954e-07, "logits/chosen": -3.3376717567443848, "logits/rejected": -3.28932785987854, "logps/chosen": -124.1201171875, "logps/ref_chosen": -66.84489440917969, "logps/ref_rejected": -74.01164245605469, "logps/rejected": -184.8052215576172, "loss": 1.0075, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8028033971786499, "rewards/margins": 0.7408318519592285, "rewards/rejected": -1.5436352491378784, "step": 388 }, { "epoch": 0.5880574452003023, "epsilon_dpo/beta": 0.013864126987755299, "epsilon_dpo/beta_margin_grad_mean": -0.3216671049594879, "epsilon_dpo/beta_margin_grad_std": 0.19483420252799988, "epsilon_dpo/beta_margin_mean": 0.9080601334571838, "epsilon_dpo/beta_margin_std": 1.050511360168457, "epsilon_dpo/loss_margin_mean": 65.85839080810547, "grad_norm": 48.08329772949219, "kl/avg_steps": 0.65625, "kl/beta": 0.013954316265881062, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.1835188660656265e-07, "logits/chosen": -3.2331197261810303, "logits/rejected": -3.262835741043091, "logps/chosen": -108.67831420898438, "logps/ref_chosen": -50.975711822509766, "logps/ref_rejected": -72.14762878417969, "logps/rejected": -195.7086181640625, "loss": 0.8921, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8031487464904785, "rewards/margins": 0.9080600738525391, "rewards/rejected": -1.7112088203430176, "step": 389 }, { "epoch": 0.5895691609977324, "epsilon_dpo/beta": 0.013791067525744438, "epsilon_dpo/beta_margin_grad_mean": -0.3632296919822693, "epsilon_dpo/beta_margin_grad_std": 0.1922600269317627, "epsilon_dpo/beta_margin_mean": 0.6897745728492737, "epsilon_dpo/beta_margin_std": 1.0113471746444702, "epsilon_dpo/loss_margin_mean": 50.415504455566406, "grad_norm": 54.563316345214844, "kl/avg_steps": 0.53125, "kl/beta": 0.0138633381575346, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.170407537241599e-07, "logits/chosen": -3.192674160003662, "logits/rejected": -3.064049243927002, "logps/chosen": -88.28244018554688, "logps/ref_chosen": -42.94923400878906, "logps/ref_rejected": -57.734718322753906, "logps/rejected": -153.48342895507812, "loss": 1.0208, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6268521547317505, "rewards/margins": 0.6897745728492737, "rewards/rejected": -1.316626787185669, "step": 390 }, { "epoch": 0.5910808767951625, "epsilon_dpo/beta": 0.013709570281207561, "epsilon_dpo/beta_margin_grad_mean": -0.31146058440208435, "epsilon_dpo/beta_margin_grad_std": 0.2093610018491745, "epsilon_dpo/beta_margin_mean": 0.995006799697876, "epsilon_dpo/beta_margin_std": 1.1393852233886719, "epsilon_dpo/loss_margin_mean": 73.0548095703125, "grad_norm": 73.71208953857422, "kl/avg_steps": 0.59375, "kl/beta": 0.013790078461170197, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1573054278272636e-07, "logits/chosen": -3.2203612327575684, "logits/rejected": -3.1838178634643555, "logps/chosen": -103.08673858642578, "logps/ref_chosen": -52.099578857421875, "logps/ref_rejected": -70.65765380859375, "logps/rejected": -194.69961547851562, "loss": 0.8745, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7016524076461792, "rewards/margins": 0.995006799697876, "rewards/rejected": -1.6966593265533447, "step": 391 }, { "epoch": 0.5925925925925926, "epsilon_dpo/beta": 0.013611514121294022, "epsilon_dpo/beta_margin_grad_mean": -0.30823200941085815, "epsilon_dpo/beta_margin_grad_std": 0.19543124735355377, "epsilon_dpo/beta_margin_mean": 0.988762617111206, "epsilon_dpo/beta_margin_std": 1.055318832397461, "epsilon_dpo/loss_margin_mean": 72.97480010986328, "grad_norm": 54.984031677246094, "kl/avg_steps": 0.71875, "kl/beta": 0.013708683662116528, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -3.394564151763916, "logits/rejected": -3.314425230026245, "logps/chosen": -103.28739929199219, "logps/ref_chosen": -59.36878967285156, "logps/ref_rejected": -87.73503112792969, "logps/rejected": -204.62844848632812, "loss": 0.8468, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5991251468658447, "rewards/margins": 0.9887626767158508, "rewards/rejected": -1.5878877639770508, "step": 392 }, { "epoch": 0.5941043083900227, "epsilon_dpo/beta": 0.013544156216084957, "epsilon_dpo/beta_margin_grad_mean": -0.31625089049339294, "epsilon_dpo/beta_margin_grad_std": 0.184193953871727, "epsilon_dpo/beta_margin_mean": 0.9234877228736877, "epsilon_dpo/beta_margin_std": 0.9508858323097229, "epsilon_dpo/loss_margin_mean": 68.62931823730469, "grad_norm": 41.886600494384766, "kl/avg_steps": 0.5, "kl/beta": 0.013610855676233768, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.131130332936195e-07, "logits/chosen": -3.194032669067383, "logits/rejected": -3.132784366607666, "logps/chosen": -101.50666809082031, "logps/ref_chosen": -47.389678955078125, "logps/ref_rejected": -71.14250183105469, "logps/rejected": -193.88882446289062, "loss": 0.8483, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7358752489089966, "rewards/margins": 0.9234877228736877, "rewards/rejected": -1.659363031387329, "step": 393 }, { "epoch": 0.5956160241874527, "epsilon_dpo/beta": 0.013476770371198654, "epsilon_dpo/beta_margin_grad_mean": -0.33718857169151306, "epsilon_dpo/beta_margin_grad_std": 0.1945227086544037, "epsilon_dpo/beta_margin_mean": 0.8201510310173035, "epsilon_dpo/beta_margin_std": 1.0113369226455688, "epsilon_dpo/loss_margin_mean": 61.31360626220703, "grad_norm": 41.906761169433594, "kl/avg_steps": 0.5, "kl/beta": 0.013543139211833477, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.1180580796331323e-07, "logits/chosen": -3.1901869773864746, "logits/rejected": -3.217527151107788, "logps/chosen": -97.13365173339844, "logps/ref_chosen": -50.93657302856445, "logps/ref_rejected": -64.4262924194336, "logps/rejected": -171.93698120117188, "loss": 0.9358, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6260335445404053, "rewards/margins": 0.8201510310173035, "rewards/rejected": -1.4461846351623535, "step": 394 }, { "epoch": 0.5971277399848829, "epsilon_dpo/beta": 0.013401299715042114, "epsilon_dpo/beta_margin_grad_mean": -0.3482809364795685, "epsilon_dpo/beta_margin_grad_std": 0.18462517857551575, "epsilon_dpo/beta_margin_mean": 0.749920666217804, "epsilon_dpo/beta_margin_std": 0.9422603249549866, "epsilon_dpo/loss_margin_mean": 56.342498779296875, "grad_norm": 54.11479949951172, "kl/avg_steps": 0.5625, "kl/beta": 0.013475760817527771, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.104996510066625e-07, "logits/chosen": -3.101229667663574, "logits/rejected": -3.1579670906066895, "logps/chosen": -98.45614624023438, "logps/ref_chosen": -44.2039794921875, "logps/ref_rejected": -65.82269287109375, "logps/rejected": -176.4173583984375, "loss": 0.9555, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7297436594963074, "rewards/margins": 0.749920666217804, "rewards/rejected": -1.4796643257141113, "step": 395 }, { "epoch": 0.5986394557823129, "epsilon_dpo/beta": 0.01330958679318428, "epsilon_dpo/beta_margin_grad_mean": -0.3151586055755615, "epsilon_dpo/beta_margin_grad_std": 0.17112332582473755, "epsilon_dpo/beta_margin_mean": 0.8961671590805054, "epsilon_dpo/beta_margin_std": 0.8688762784004211, "epsilon_dpo/loss_margin_mean": 67.62623596191406, "grad_norm": 43.30214309692383, "kl/avg_steps": 0.6875, "kl/beta": 0.013400383293628693, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.0919459895968517e-07, "logits/chosen": -3.1321287155151367, "logits/rejected": -3.09877872467041, "logps/chosen": -103.09954071044922, "logps/ref_chosen": -44.96233367919922, "logps/ref_rejected": -72.56996154785156, "logps/rejected": -198.33340454101562, "loss": 0.8378, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7754774689674377, "rewards/margins": 0.8961671590805054, "rewards/rejected": -1.671644687652588, "step": 396 }, { "epoch": 0.600151171579743, "epsilon_dpo/beta": 0.013243664056062698, "epsilon_dpo/beta_margin_grad_mean": -0.40136241912841797, "epsilon_dpo/beta_margin_grad_std": 0.2166062742471695, "epsilon_dpo/beta_margin_mean": 0.4942212402820587, "epsilon_dpo/beta_margin_std": 1.130833625793457, "epsilon_dpo/loss_margin_mean": 37.853843688964844, "grad_norm": 60.07572555541992, "kl/avg_steps": 0.5, "kl/beta": 0.01330888457596302, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.078906883274924e-07, "logits/chosen": -3.271454334259033, "logits/rejected": -3.2791268825531006, "logps/chosen": -119.51215362548828, "logps/ref_chosen": -60.217018127441406, "logps/ref_rejected": -82.36647033691406, "logps/rejected": -179.5154571533203, "loss": 1.2192, "rewards/accuracies": 0.75, "rewards/chosen": -0.7896686792373657, "rewards/margins": 0.4942212700843811, "rewards/rejected": -1.283889889717102, "step": 397 }, { "epoch": 0.6016628873771731, "epsilon_dpo/beta": 0.013169498182833195, "epsilon_dpo/beta_margin_grad_mean": -0.3528101444244385, "epsilon_dpo/beta_margin_grad_std": 0.1976197212934494, "epsilon_dpo/beta_margin_mean": 0.763227105140686, "epsilon_dpo/beta_margin_std": 1.0406984090805054, "epsilon_dpo/loss_margin_mean": 58.38323974609375, "grad_norm": 56.82994079589844, "kl/avg_steps": 0.5625, "kl/beta": 0.013242671266198158, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.065879555832674e-07, "logits/chosen": -3.22432279586792, "logits/rejected": -3.2247073650360107, "logps/chosen": -98.45611572265625, "logps/ref_chosen": -45.47966003417969, "logps/ref_rejected": -69.47705078125, "logps/rejected": -180.83676147460938, "loss": 0.9802, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6986398696899414, "rewards/margins": 0.763227105140686, "rewards/rejected": -1.461867094039917, "step": 398 }, { "epoch": 0.6031746031746031, "epsilon_dpo/beta": 0.013095834292471409, "epsilon_dpo/beta_margin_grad_mean": -0.33205607533454895, "epsilon_dpo/beta_margin_grad_std": 0.18497049808502197, "epsilon_dpo/beta_margin_mean": 0.8484097719192505, "epsilon_dpo/beta_margin_std": 0.9843422770500183, "epsilon_dpo/loss_margin_mean": 65.20555877685547, "grad_norm": 44.30854415893555, "kl/avg_steps": 0.5625, "kl/beta": 0.013168598525226116, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.052864371672457e-07, "logits/chosen": -3.2356061935424805, "logits/rejected": -3.3244175910949707, "logps/chosen": -126.0616226196289, "logps/ref_chosen": -60.490474700927734, "logps/ref_rejected": -105.50724792480469, "logps/rejected": -236.28395080566406, "loss": 0.9003, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8615003824234009, "rewards/margins": 0.8484097719192505, "rewards/rejected": -1.7099101543426514, "step": 399 }, { "epoch": 0.6046863189720333, "epsilon_dpo/beta": 0.013024607673287392, "epsilon_dpo/beta_margin_grad_mean": -0.35612669587135315, "epsilon_dpo/beta_margin_grad_std": 0.1694706231355667, "epsilon_dpo/beta_margin_mean": 0.6899195909500122, "epsilon_dpo/beta_margin_std": 0.848055899143219, "epsilon_dpo/loss_margin_mean": 53.3231201171875, "grad_norm": 44.88621520996094, "kl/avg_steps": 0.546875, "kl/beta": 0.013094939291477203, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0398616948569493e-07, "logits/chosen": -3.2633471488952637, "logits/rejected": -3.3672585487365723, "logps/chosen": -130.7532958984375, "logps/ref_chosen": -63.98377227783203, "logps/ref_rejected": -89.09565734863281, "logps/rejected": -209.18829345703125, "loss": 0.9633, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8712400197982788, "rewards/margins": 0.6899195909500122, "rewards/rejected": -1.561159610748291, "step": 400 }, { "epoch": 0.6046863189720333, "eval_epsilon_dpo/beta": 0.012965280562639236, "eval_epsilon_dpo/beta_margin_grad_mean": -0.3752177059650421, "eval_epsilon_dpo/beta_margin_grad_std": 0.19133561849594116, "eval_epsilon_dpo/beta_margin_mean": 0.609695553779602, "eval_epsilon_dpo/beta_margin_std": 0.9587175846099854, "eval_epsilon_dpo/loss_margin_mean": 47.50445556640625, "eval_kl/n_epsilon_steps": 0.2698063254356384, "eval_kl/p_epsilon_steps": 0.7284330725669861, "eval_logits/chosen": -3.4438045024871826, "eval_logits/rejected": -3.3090176582336426, "eval_logps/chosen": -137.7568359375, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -181.38076782226562, "eval_loss": 0.5350582599639893, "eval_rewards/accuracies": 0.7266725301742554, "eval_rewards/chosen": -0.7854338884353638, "eval_rewards/margins": 0.609695553779602, "eval_rewards/rejected": -1.3951295614242554, "eval_runtime": 37.0942, "eval_samples_per_second": 62.085, "eval_steps_per_second": 1.941, "step": 400 }, { "epoch": 0.6061980347694633, "epsilon_dpo/beta": 0.012939541600644588, "epsilon_dpo/beta_margin_grad_mean": -0.31344011425971985, "epsilon_dpo/beta_margin_grad_std": 0.1722596436738968, "epsilon_dpo/beta_margin_mean": 0.9197573065757751, "epsilon_dpo/beta_margin_std": 0.8997470736503601, "epsilon_dpo/loss_margin_mean": 71.4128646850586, "grad_norm": 42.23625183105469, "kl/avg_steps": 0.65625, "kl/beta": 0.01302371546626091, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.0268718890989752e-07, "logits/chosen": -3.1860764026641846, "logits/rejected": -3.2014307975769043, "logps/chosen": -84.72711944580078, "logps/ref_chosen": -43.849422454833984, "logps/ref_rejected": -69.31915283203125, "logps/rejected": -181.60971069335938, "loss": 0.8308, "rewards/accuracies": 0.84375, "rewards/chosen": -0.530019998550415, "rewards/margins": 0.9197573065757751, "rewards/rejected": -1.449777364730835, "step": 401 }, { "epoch": 0.6077097505668935, "epsilon_dpo/beta": 0.012863267213106155, "epsilon_dpo/beta_margin_grad_mean": -0.35830989480018616, "epsilon_dpo/beta_margin_grad_std": 0.19567129015922546, "epsilon_dpo/beta_margin_mean": 0.6767988204956055, "epsilon_dpo/beta_margin_std": 0.9782071113586426, "epsilon_dpo/loss_margin_mean": 53.062503814697266, "grad_norm": 51.15648651123047, "kl/avg_steps": 0.59375, "kl/beta": 0.012938804924488068, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.013895317751323e-07, "logits/chosen": -3.2804250717163086, "logits/rejected": -2.9979867935180664, "logps/chosen": -106.02536010742188, "logps/ref_chosen": -55.16570281982422, "logps/ref_rejected": -53.97565460205078, "logps/rejected": -157.89781188964844, "loss": 1.0257, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6573912501335144, "rewards/margins": 0.6767988204956055, "rewards/rejected": -1.3341901302337646, "step": 402 }, { "epoch": 0.6092214663643235, "epsilon_dpo/beta": 0.012771262787282467, "epsilon_dpo/beta_margin_grad_mean": -0.3075571358203888, "epsilon_dpo/beta_margin_grad_std": 0.18084633350372314, "epsilon_dpo/beta_margin_mean": 0.9646462202072144, "epsilon_dpo/beta_margin_std": 0.9459803700447083, "epsilon_dpo/loss_margin_mean": 75.84674072265625, "grad_norm": 44.909976959228516, "kl/avg_steps": 0.71875, "kl/beta": 0.01286243461072445, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.0009323437965898e-07, "logits/chosen": -3.0720326900482178, "logits/rejected": -3.156073570251465, "logps/chosen": -97.13226318359375, "logps/ref_chosen": -42.436561584472656, "logps/ref_rejected": -80.56928253173828, "logps/rejected": -211.11170959472656, "loss": 0.821, "rewards/accuracies": 0.84375, "rewards/chosen": -0.700576901435852, "rewards/margins": 0.9646462202072144, "rewards/rejected": -1.6652231216430664, "step": 403 }, { "epoch": 0.6107331821617535, "epsilon_dpo/beta": 0.012712053954601288, "epsilon_dpo/beta_margin_grad_mean": -0.34417861700057983, "epsilon_dpo/beta_margin_grad_std": 0.19920368492603302, "epsilon_dpo/beta_margin_mean": 0.8112690448760986, "epsilon_dpo/beta_margin_std": 1.0682636499404907, "epsilon_dpo/loss_margin_mean": 64.33700561523438, "grad_norm": 53.768882751464844, "kl/avg_steps": 0.46875, "kl/beta": 0.012770645320415497, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.9879833298370237e-07, "logits/chosen": -3.2815704345703125, "logits/rejected": -3.317511558532715, "logps/chosen": -108.11491394042969, "logps/ref_chosen": -55.80046844482422, "logps/ref_rejected": -87.4559326171875, "logps/rejected": -204.1073760986328, "loss": 0.9582, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6672581434249878, "rewards/margins": 0.8112690448760986, "rewards/rejected": -1.478527307510376, "step": 404 }, { "epoch": 0.6122448979591837, "epsilon_dpo/beta": 0.012632881291210651, "epsilon_dpo/beta_margin_grad_mean": -0.33801528811454773, "epsilon_dpo/beta_margin_grad_std": 0.18035483360290527, "epsilon_dpo/beta_margin_mean": 0.8109557628631592, "epsilon_dpo/beta_margin_std": 0.9414539933204651, "epsilon_dpo/loss_margin_mean": 64.55620574951172, "grad_norm": 49.85801696777344, "kl/avg_steps": 0.625, "kl/beta": 0.012711062096059322, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.975048638084379e-07, "logits/chosen": -3.010253429412842, "logits/rejected": -3.0389394760131836, "logps/chosen": -87.96479034423828, "logps/ref_chosen": -42.07228469848633, "logps/ref_rejected": -58.06654739379883, "logps/rejected": -168.5152587890625, "loss": 0.9125, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5818367004394531, "rewards/margins": 0.8109557628631592, "rewards/rejected": -1.3927924633026123, "step": 405 }, { "epoch": 0.6137566137566137, "epsilon_dpo/beta": 0.012550468556582928, "epsilon_dpo/beta_margin_grad_mean": -0.31726282835006714, "epsilon_dpo/beta_margin_grad_std": 0.1875184327363968, "epsilon_dpo/beta_margin_mean": 0.9191246628761292, "epsilon_dpo/beta_margin_std": 0.968756377696991, "epsilon_dpo/loss_margin_mean": 73.62574005126953, "grad_norm": 41.341636657714844, "kl/avg_steps": 0.65625, "kl/beta": 0.012632111087441444, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.9621286303497914e-07, "logits/chosen": -3.1759350299835205, "logits/rejected": -3.22086763381958, "logps/chosen": -85.02434539794922, "logps/ref_chosen": -41.767059326171875, "logps/ref_rejected": -76.3978271484375, "logps/rejected": -193.28085327148438, "loss": 0.8579, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5443283915519714, "rewards/margins": 0.9191246032714844, "rewards/rejected": -1.4634530544281006, "step": 406 }, { "epoch": 0.6152683295540439, "epsilon_dpo/beta": 0.012464719824492931, "epsilon_dpo/beta_margin_grad_mean": -0.33188769221305847, "epsilon_dpo/beta_margin_grad_std": 0.161079540848732, "epsilon_dpo/beta_margin_mean": 0.7952305674552917, "epsilon_dpo/beta_margin_std": 0.7854166030883789, "epsilon_dpo/loss_margin_mean": 64.09669494628906, "grad_norm": 47.77092361450195, "kl/avg_steps": 0.6875, "kl/beta": 0.012549753300845623, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.9492236680336483e-07, "logits/chosen": -3.270841598510742, "logits/rejected": -3.2717809677124023, "logps/chosen": -139.7053985595703, "logps/ref_chosen": -76.91526794433594, "logps/ref_rejected": -85.80078125, "logps/rejected": -212.68759155273438, "loss": 0.8744, "rewards/accuracies": 0.828125, "rewards/chosen": -0.785014271736145, "rewards/margins": 0.795230507850647, "rewards/rejected": -1.580244779586792, "step": 407 }, { "epoch": 0.6167800453514739, "epsilon_dpo/beta": 0.012364028953015804, "epsilon_dpo/beta_margin_grad_mean": -0.2871266007423401, "epsilon_dpo/beta_margin_grad_std": 0.14586235582828522, "epsilon_dpo/beta_margin_mean": 1.0300157070159912, "epsilon_dpo/beta_margin_std": 0.7732513546943665, "epsilon_dpo/loss_margin_mean": 83.50468444824219, "grad_norm": 32.1417121887207, "kl/avg_steps": 0.8125, "kl/beta": 0.012464063242077827, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.9363341121154895e-07, "logits/chosen": -3.1778085231781006, "logits/rejected": -3.0978498458862305, "logps/chosen": -85.65403747558594, "logps/ref_chosen": -47.85430145263672, "logps/ref_rejected": -63.41582489013672, "logps/rejected": -184.72024536132812, "loss": 0.724, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4675591289997101, "rewards/margins": 1.0300157070159912, "rewards/rejected": -1.497574806213379, "step": 408 }, { "epoch": 0.618291761148904, "epsilon_dpo/beta": 0.012330069206655025, "epsilon_dpo/beta_margin_grad_mean": -0.40672364830970764, "epsilon_dpo/beta_margin_grad_std": 0.1917557716369629, "epsilon_dpo/beta_margin_mean": 0.4722346067428589, "epsilon_dpo/beta_margin_std": 0.9351789951324463, "epsilon_dpo/loss_margin_mean": 38.836978912353516, "grad_norm": 54.698421478271484, "kl/avg_steps": 0.28125, "kl/beta": 0.012363608926534653, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.9234603231438994e-07, "logits/chosen": -3.1757397651672363, "logits/rejected": -3.1159112453460693, "logps/chosen": -114.55815124511719, "logps/ref_chosen": -54.1250114440918, "logps/ref_rejected": -59.93003845214844, "logps/rejected": -159.20016479492188, "loss": 1.1594, "rewards/accuracies": 0.609375, "rewards/chosen": -0.7486213445663452, "rewards/margins": 0.4722346067428589, "rewards/rejected": -1.220855951309204, "step": 409 }, { "epoch": 0.6198034769463341, "epsilon_dpo/beta": 0.0122415442019701, "epsilon_dpo/beta_margin_grad_mean": -0.33080294728279114, "epsilon_dpo/beta_margin_grad_std": 0.1546378880739212, "epsilon_dpo/beta_margin_mean": 0.7987654805183411, "epsilon_dpo/beta_margin_std": 0.7712497711181641, "epsilon_dpo/loss_margin_mean": 65.52301025390625, "grad_norm": 39.59071731567383, "kl/avg_steps": 0.71875, "kl/beta": 0.01232893392443657, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -3.3147921562194824, "logits/rejected": -3.2250471115112305, "logps/chosen": -108.56951904296875, "logps/ref_chosen": -54.869972229003906, "logps/ref_rejected": -71.07942962646484, "logps/rejected": -190.30197143554688, "loss": 0.8653, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6589058637619019, "rewards/margins": 0.7987654805183411, "rewards/rejected": -1.4576714038848877, "step": 410 }, { "epoch": 0.6213151927437641, "epsilon_dpo/beta": 0.01216566190123558, "epsilon_dpo/beta_margin_grad_mean": -0.3512946367263794, "epsilon_dpo/beta_margin_grad_std": 0.1858922392129898, "epsilon_dpo/beta_margin_mean": 0.7293370366096497, "epsilon_dpo/beta_margin_std": 0.9800242185592651, "epsilon_dpo/loss_margin_mean": 60.33929443359375, "grad_norm": 40.24601745605469, "kl/avg_steps": 0.625, "kl/beta": 0.012240951880812645, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.8977614860195296e-07, "logits/chosen": -3.0415139198303223, "logits/rejected": -3.1691360473632812, "logps/chosen": -108.67929077148438, "logps/ref_chosen": -47.60760498046875, "logps/ref_rejected": -76.2720947265625, "logps/rejected": -197.68307495117188, "loss": 0.9807, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7459183931350708, "rewards/margins": 0.7293370366096497, "rewards/rejected": -1.4752554893493652, "step": 411 }, { "epoch": 0.6228269085411943, "epsilon_dpo/beta": 0.012082495726644993, "epsilon_dpo/beta_margin_grad_mean": -0.3444744944572449, "epsilon_dpo/beta_margin_grad_std": 0.15515269339084625, "epsilon_dpo/beta_margin_mean": 0.7332562804222107, "epsilon_dpo/beta_margin_std": 0.762146532535553, "epsilon_dpo/loss_margin_mean": 60.95484161376953, "grad_norm": 47.7208137512207, "kl/avg_steps": 0.6875, "kl/beta": 0.012164921499788761, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.8849371567184662e-07, "logits/chosen": -3.1117968559265137, "logits/rejected": -3.019360303878784, "logps/chosen": -109.41290283203125, "logps/ref_chosen": -46.98847961425781, "logps/ref_rejected": -61.42888259887695, "logps/rejected": -184.8081512451172, "loss": 0.9063, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7563140392303467, "rewards/margins": 0.7332562804222107, "rewards/rejected": -1.4895703792572021, "step": 412 }, { "epoch": 0.6243386243386243, "epsilon_dpo/beta": 0.012005641125142574, "epsilon_dpo/beta_margin_grad_mean": -0.3349611461162567, "epsilon_dpo/beta_margin_grad_std": 0.18901513516902924, "epsilon_dpo/beta_margin_mean": 0.8059801459312439, "epsilon_dpo/beta_margin_std": 0.9869521260261536, "epsilon_dpo/loss_margin_mean": 67.5698013305664, "grad_norm": 42.9257698059082, "kl/avg_steps": 0.640625, "kl/beta": 0.012081858702003956, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.872130032047302e-07, "logits/chosen": -3.183147430419922, "logits/rejected": -3.069366455078125, "logps/chosen": -127.51182556152344, "logps/ref_chosen": -59.06121063232422, "logps/ref_rejected": -66.26124572753906, "logps/rejected": -202.2816619873047, "loss": 0.935, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8247032165527344, "rewards/margins": 0.8059800863265991, "rewards/rejected": -1.630683183670044, "step": 413 }, { "epoch": 0.6258503401360545, "epsilon_dpo/beta": 0.011919857934117317, "epsilon_dpo/beta_margin_grad_mean": -0.324365496635437, "epsilon_dpo/beta_margin_grad_std": 0.16152916848659515, "epsilon_dpo/beta_margin_mean": 0.8370386958122253, "epsilon_dpo/beta_margin_std": 0.8139888644218445, "epsilon_dpo/loss_margin_mean": 70.52116394042969, "grad_norm": 49.221702575683594, "kl/avg_steps": 0.71875, "kl/beta": 0.012004951946437359, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.8593404702488436e-07, "logits/chosen": -3.057481288909912, "logits/rejected": -3.0569777488708496, "logps/chosen": -104.4681396484375, "logps/ref_chosen": -45.23055648803711, "logps/ref_rejected": -69.24102783203125, "logps/rejected": -198.99977111816406, "loss": 0.8549, "rewards/accuracies": 0.875, "rewards/chosen": -0.7074226140975952, "rewards/margins": 0.8370386958122253, "rewards/rejected": -1.5444612503051758, "step": 414 }, { "epoch": 0.6273620559334845, "epsilon_dpo/beta": 0.011853421106934547, "epsilon_dpo/beta_margin_grad_mean": -0.35024699568748474, "epsilon_dpo/beta_margin_grad_std": 0.1677493304014206, "epsilon_dpo/beta_margin_mean": 0.7103952169418335, "epsilon_dpo/beta_margin_std": 0.8432081937789917, "epsilon_dpo/loss_margin_mean": 60.33004379272461, "grad_norm": 38.466915130615234, "kl/avg_steps": 0.5625, "kl/beta": 0.01191928144544363, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.846568829074628e-07, "logits/chosen": -3.0582566261291504, "logits/rejected": -3.0417425632476807, "logps/chosen": -104.78120422363281, "logps/ref_chosen": -48.924766540527344, "logps/ref_rejected": -55.06373596191406, "logps/rejected": -171.25021362304688, "loss": 0.9478, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6628572940826416, "rewards/margins": 0.7103952169418335, "rewards/rejected": -1.373252511024475, "step": 415 }, { "epoch": 0.6288737717309146, "epsilon_dpo/beta": 0.01179265696555376, "epsilon_dpo/beta_margin_grad_mean": -0.36480674147605896, "epsilon_dpo/beta_margin_grad_std": 0.19255809485912323, "epsilon_dpo/beta_margin_mean": 0.6533018350601196, "epsilon_dpo/beta_margin_std": 0.9482189416885376, "epsilon_dpo/loss_margin_mean": 55.90175247192383, "grad_norm": 51.852542877197266, "kl/avg_steps": 0.515625, "kl/beta": 0.011852610856294632, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.8338154657749128e-07, "logits/chosen": -3.1024200916290283, "logits/rejected": -2.9976954460144043, "logps/chosen": -121.18083190917969, "logps/ref_chosen": -52.09894561767578, "logps/ref_rejected": -60.379310607910156, "logps/rejected": -185.36294555664062, "loss": 1.03, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8176281452178955, "rewards/margins": 0.6533017754554749, "rewards/rejected": -1.4709298610687256, "step": 416 }, { "epoch": 0.6303854875283447, "epsilon_dpo/beta": 0.011706365272402763, "epsilon_dpo/beta_margin_grad_mean": -0.33509349822998047, "epsilon_dpo/beta_margin_grad_std": 0.17879465222358704, "epsilon_dpo/beta_margin_mean": 0.7893485426902771, "epsilon_dpo/beta_margin_std": 0.9173914790153503, "epsilon_dpo/loss_margin_mean": 67.77025604248047, "grad_norm": 48.18291473388672, "kl/avg_steps": 0.734375, "kl/beta": 0.01179180946201086, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.8210807370886849e-07, "logits/chosen": -3.0667994022369385, "logits/rejected": -2.9707934856414795, "logps/chosen": -113.939697265625, "logps/ref_chosen": -50.82991027832031, "logps/ref_rejected": -57.54952621459961, "logps/rejected": -188.4295654296875, "loss": 0.9217, "rewards/accuracies": 0.875, "rewards/chosen": -0.7411265969276428, "rewards/margins": 0.7893485426902771, "rewards/rejected": -1.53047513961792, "step": 417 }, { "epoch": 0.6318972033257747, "epsilon_dpo/beta": 0.011655797250568867, "epsilon_dpo/beta_margin_grad_mean": -0.3838587999343872, "epsilon_dpo/beta_margin_grad_std": 0.1869409829378128, "epsilon_dpo/beta_margin_mean": 0.5660474896430969, "epsilon_dpo/beta_margin_std": 0.9232034683227539, "epsilon_dpo/loss_margin_mean": 49.08390426635742, "grad_norm": 51.51350021362305, "kl/avg_steps": 0.4375, "kl/beta": 0.011705844663083553, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.8083649992336825e-07, "logits/chosen": -3.107374429702759, "logits/rejected": -3.148359537124634, "logps/chosen": -139.40093994140625, "logps/ref_chosen": -59.049217224121094, "logps/ref_rejected": -66.80460357666016, "logps/rejected": -196.240234375, "loss": 1.0811, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9401662349700928, "rewards/margins": 0.5660475492477417, "rewards/rejected": -1.5062137842178345, "step": 418 }, { "epoch": 0.6334089191232048, "epsilon_dpo/beta": 0.011579527519643307, "epsilon_dpo/beta_margin_grad_mean": -0.31527018547058105, "epsilon_dpo/beta_margin_grad_std": 0.18519006669521332, "epsilon_dpo/beta_margin_mean": 0.9286851286888123, "epsilon_dpo/beta_margin_std": 0.9508613348007202, "epsilon_dpo/loss_margin_mean": 80.62158966064453, "grad_norm": 50.5211181640625, "kl/avg_steps": 0.65625, "kl/beta": 0.011654854752123356, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.7956686078964255e-07, "logits/chosen": -3.0360560417175293, "logits/rejected": -3.1040239334106445, "logps/chosen": -103.87488555908203, "logps/ref_chosen": -51.049774169921875, "logps/ref_rejected": -66.17208099365234, "logps/rejected": -199.6187744140625, "loss": 0.8458, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6151851415634155, "rewards/margins": 0.9286850690841675, "rewards/rejected": -1.543870210647583, "step": 419 }, { "epoch": 0.6349206349206349, "epsilon_dpo/beta": 0.011542011052370071, "epsilon_dpo/beta_margin_grad_mean": -0.3916632831096649, "epsilon_dpo/beta_margin_grad_std": 0.19140572845935822, "epsilon_dpo/beta_margin_mean": 0.5247743129730225, "epsilon_dpo/beta_margin_std": 0.9314157366752625, "epsilon_dpo/loss_margin_mean": 46.02699279785156, "grad_norm": 56.10710906982422, "kl/avg_steps": 0.328125, "kl/beta": 0.011578868143260479, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.782991918222275e-07, "logits/chosen": -3.11198091506958, "logits/rejected": -3.0053396224975586, "logps/chosen": -129.7800750732422, "logps/ref_chosen": -52.37065124511719, "logps/ref_rejected": -57.59639358520508, "logps/rejected": -181.03280639648438, "loss": 1.1189, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8975132703781128, "rewards/margins": 0.5247743129730225, "rewards/rejected": -1.4222877025604248, "step": 420 }, { "epoch": 0.636432350718065, "epsilon_dpo/beta": 0.01148444227874279, "epsilon_dpo/beta_margin_grad_mean": -0.3743596374988556, "epsilon_dpo/beta_margin_grad_std": 0.19991958141326904, "epsilon_dpo/beta_margin_mean": 0.6282655000686646, "epsilon_dpo/beta_margin_std": 1.040977954864502, "epsilon_dpo/loss_margin_mean": 55.252628326416016, "grad_norm": 51.827857971191406, "kl/avg_steps": 0.5, "kl/beta": 0.011540999636054039, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7703352848054887e-07, "logits/chosen": -3.0425453186035156, "logits/rejected": -3.064128875732422, "logps/chosen": -121.2735595703125, "logps/ref_chosen": -50.42585754394531, "logps/ref_rejected": -62.157188415527344, "logps/rejected": -188.25750732421875, "loss": 1.0781, "rewards/accuracies": 0.75, "rewards/chosen": -0.8171975612640381, "rewards/margins": 0.6282655000686646, "rewards/rejected": -1.445462942123413, "step": 421 }, { "epoch": 0.6379440665154951, "epsilon_dpo/beta": 0.011412950232625008, "epsilon_dpo/beta_margin_grad_mean": -0.3421357572078705, "epsilon_dpo/beta_margin_grad_std": 0.15693418681621552, "epsilon_dpo/beta_margin_mean": 0.7410057783126831, "epsilon_dpo/beta_margin_std": 0.7679588794708252, "epsilon_dpo/loss_margin_mean": 65.28427124023438, "grad_norm": 50.42546844482422, "kl/avg_steps": 0.625, "kl/beta": 0.011483581736683846, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7576990616793137e-07, "logits/chosen": -3.2711071968078613, "logits/rejected": -3.146646022796631, "logps/chosen": -111.4495849609375, "logps/ref_chosen": -61.9298095703125, "logps/ref_rejected": -64.45858764648438, "logps/rejected": -179.26263427734375, "loss": 0.9035, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5661858320236206, "rewards/margins": 0.7410057783126831, "rewards/rejected": -1.3071916103363037, "step": 422 }, { "epoch": 0.6394557823129252, "epsilon_dpo/beta": 0.01135632861405611, "epsilon_dpo/beta_margin_grad_mean": -0.3255573511123657, "epsilon_dpo/beta_margin_grad_std": 0.18686482310295105, "epsilon_dpo/beta_margin_mean": 0.8518685698509216, "epsilon_dpo/beta_margin_std": 0.9424189925193787, "epsilon_dpo/loss_margin_mean": 75.54313659667969, "grad_norm": 40.837005615234375, "kl/avg_steps": 0.5, "kl/beta": 0.01141225453466177, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.745083602306071e-07, "logits/chosen": -3.008904457092285, "logits/rejected": -3.1958439350128174, "logps/chosen": -105.96476745605469, "logps/ref_chosen": -47.76591491699219, "logps/ref_rejected": -74.40087127685547, "logps/rejected": -208.14285278320312, "loss": 0.8936, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6622984409332275, "rewards/margins": 0.8518685698509216, "rewards/rejected": -1.514167070388794, "step": 423 }, { "epoch": 0.6409674981103552, "epsilon_dpo/beta": 0.011282085441052914, "epsilon_dpo/beta_margin_grad_mean": -0.32188084721565247, "epsilon_dpo/beta_margin_grad_std": 0.17614509165287018, "epsilon_dpo/beta_margin_mean": 0.8858447670936584, "epsilon_dpo/beta_margin_std": 0.9228990077972412, "epsilon_dpo/loss_margin_mean": 78.91209411621094, "grad_norm": 49.64436721801758, "kl/avg_steps": 0.65625, "kl/beta": 0.011355477385222912, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.7324892595672804e-07, "logits/chosen": -3.1539053916931152, "logits/rejected": -3.2170872688293457, "logps/chosen": -110.01268005371094, "logps/ref_chosen": -46.778297424316406, "logps/ref_rejected": -75.71923828125, "logps/rejected": -217.86572265625, "loss": 0.8572, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7147679328918457, "rewards/margins": 0.8858447670936584, "rewards/rejected": -1.6006126403808594, "step": 424 }, { "epoch": 0.6424792139077853, "epsilon_dpo/beta": 0.011219106614589691, "epsilon_dpo/beta_margin_grad_mean": -0.3644617795944214, "epsilon_dpo/beta_margin_grad_std": 0.1657644659280777, "epsilon_dpo/beta_margin_mean": 0.6436865329742432, "epsilon_dpo/beta_margin_std": 0.8037853837013245, "epsilon_dpo/loss_margin_mean": 57.76534652709961, "grad_norm": 43.20616149902344, "kl/avg_steps": 0.5625, "kl/beta": 0.011281442828476429, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.7199163857537824e-07, "logits/chosen": -3.105804443359375, "logits/rejected": -2.9913408756256104, "logps/chosen": -112.49369812011719, "logps/ref_chosen": -51.11269760131836, "logps/ref_rejected": -59.31032943725586, "logps/rejected": -178.4566650390625, "loss": 0.9827, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6914188265800476, "rewards/margins": 0.6436865329742432, "rewards/rejected": -1.3351054191589355, "step": 425 }, { "epoch": 0.6439909297052154, "epsilon_dpo/beta": 0.011193148791790009, "epsilon_dpo/beta_margin_grad_mean": -0.41356194019317627, "epsilon_dpo/beta_margin_grad_std": 0.18374751508235931, "epsilon_dpo/beta_margin_mean": 0.43696534633636475, "epsilon_dpo/beta_margin_std": 0.9536048769950867, "epsilon_dpo/loss_margin_mean": 39.60122299194336, "grad_norm": 61.31911087036133, "kl/avg_steps": 0.234375, "kl/beta": 0.011218340136110783, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 1.7073653325558828e-07, "logits/chosen": -3.1485185623168945, "logits/rejected": -3.132035970687866, "logps/chosen": -135.79818725585938, "logps/ref_chosen": -57.66645050048828, "logps/ref_rejected": -71.13719177246094, "logps/rejected": -188.87014770507812, "loss": 1.1854, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8773593306541443, "rewards/margins": 0.43696531653404236, "rewards/rejected": -1.3143246173858643, "step": 426 }, { "epoch": 0.6455026455026455, "epsilon_dpo/beta": 0.011137261986732483, "epsilon_dpo/beta_margin_grad_mean": -0.35336023569107056, "epsilon_dpo/beta_margin_grad_std": 0.18582922220230103, "epsilon_dpo/beta_margin_mean": 0.7329629063606262, "epsilon_dpo/beta_margin_std": 0.9645615220069885, "epsilon_dpo/loss_margin_mean": 66.33154296875, "grad_norm": 49.482887268066406, "kl/avg_steps": 0.5, "kl/beta": 0.011192108504474163, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6948364510535218e-07, "logits/chosen": -3.140113592147827, "logits/rejected": -3.195136070251465, "logps/chosen": -118.420166015625, "logps/ref_chosen": -50.623207092285156, "logps/ref_rejected": -78.38642120361328, "logps/rejected": -212.51490783691406, "loss": 0.9719, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7568691968917847, "rewards/margins": 0.7329628467559814, "rewards/rejected": -1.4898320436477661, "step": 427 }, { "epoch": 0.6470143613000756, "epsilon_dpo/beta": 0.011067930608987808, "epsilon_dpo/beta_margin_grad_mean": -0.34974318742752075, "epsilon_dpo/beta_margin_grad_std": 0.19181585311889648, "epsilon_dpo/beta_margin_mean": 0.7494282722473145, "epsilon_dpo/beta_margin_std": 0.9935394525527954, "epsilon_dpo/loss_margin_mean": 68.17057800292969, "grad_norm": 41.909603118896484, "kl/avg_steps": 0.625, "kl/beta": 0.011136426590383053, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -3.287627696990967, "logits/rejected": -3.1633493900299072, "logps/chosen": -140.1917266845703, "logps/ref_chosen": -65.96379089355469, "logps/ref_rejected": -78.78587341308594, "logps/rejected": -221.18438720703125, "loss": 0.9737, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8232454061508179, "rewards/margins": 0.7494282722473145, "rewards/rejected": -1.5726736783981323, "step": 428 }, { "epoch": 0.6485260770975056, "epsilon_dpo/beta": 0.011013020761311054, "epsilon_dpo/beta_margin_grad_mean": -0.3587930202484131, "epsilon_dpo/beta_margin_grad_std": 0.18254989385604858, "epsilon_dpo/beta_margin_mean": 0.6590009331703186, "epsilon_dpo/beta_margin_std": 0.8796647787094116, "epsilon_dpo/loss_margin_mean": 60.36332702636719, "grad_norm": 44.14021682739258, "kl/avg_steps": 0.5, "kl/beta": 0.011067255400121212, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.669846604344412e-07, "logits/chosen": -3.023120880126953, "logits/rejected": -2.933408737182617, "logps/chosen": -125.18557739257812, "logps/ref_chosen": -52.86711120605469, "logps/ref_rejected": -55.48959732055664, "logps/rejected": -188.17138671875, "loss": 1.0024, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7996683120727539, "rewards/margins": 0.6590009331703186, "rewards/rejected": -1.4586691856384277, "step": 429 }, { "epoch": 0.6500377928949358, "epsilon_dpo/beta": 0.010935841128230095, "epsilon_dpo/beta_margin_grad_mean": -0.32926544547080994, "epsilon_dpo/beta_margin_grad_std": 0.16211183369159698, "epsilon_dpo/beta_margin_mean": 0.8021836876869202, "epsilon_dpo/beta_margin_std": 0.7979654669761658, "epsilon_dpo/loss_margin_mean": 73.68955993652344, "grad_norm": 39.794898986816406, "kl/avg_steps": 0.703125, "kl/beta": 0.01101219467818737, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.6573863381573954e-07, "logits/chosen": -3.1020662784576416, "logits/rejected": -2.9608521461486816, "logps/chosen": -109.80938720703125, "logps/ref_chosen": -51.888057708740234, "logps/ref_rejected": -62.767486572265625, "logps/rejected": -194.37838745117188, "loss": 0.8741, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6356431245803833, "rewards/margins": 0.8021836280822754, "rewards/rejected": -1.4378267526626587, "step": 430 }, { "epoch": 0.6515495086923658, "epsilon_dpo/beta": 0.010868046432733536, "epsilon_dpo/beta_margin_grad_mean": -0.372292160987854, "epsilon_dpo/beta_margin_grad_std": 0.16359832882881165, "epsilon_dpo/beta_margin_mean": 0.5940731167793274, "epsilon_dpo/beta_margin_std": 0.7968641519546509, "epsilon_dpo/loss_margin_mean": 55.029911041259766, "grad_norm": 34.92251968383789, "kl/avg_steps": 0.625, "kl/beta": 0.010935305617749691, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -3.0833945274353027, "logits/rejected": -3.2080745697021484, "logps/chosen": -107.06591796875, "logps/ref_chosen": -46.398841857910156, "logps/ref_rejected": -72.00377655029297, "logps/rejected": -187.70077514648438, "loss": 1.0163, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6613021492958069, "rewards/margins": 0.5940730571746826, "rewards/rejected": -1.2553751468658447, "step": 431 }, { "epoch": 0.6530612244897959, "epsilon_dpo/beta": 0.010800544172525406, "epsilon_dpo/beta_margin_grad_mean": -0.33500024676322937, "epsilon_dpo/beta_margin_grad_std": 0.1664627343416214, "epsilon_dpo/beta_margin_mean": 0.7738580107688904, "epsilon_dpo/beta_margin_std": 0.8041167855262756, "epsilon_dpo/loss_margin_mean": 72.05082702636719, "grad_norm": 40.522682189941406, "kl/avg_steps": 0.625, "kl/beta": 0.010867385193705559, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.632536862810844e-07, "logits/chosen": -3.290902853012085, "logits/rejected": -3.2200634479522705, "logps/chosen": -108.97900390625, "logps/ref_chosen": -56.4910774230957, "logps/ref_rejected": -78.9006576538086, "logps/rejected": -203.43939208984375, "loss": 0.8965, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5692088007926941, "rewards/margins": 0.7738580107688904, "rewards/rejected": -1.3430668115615845, "step": 432 }, { "epoch": 0.654572940287226, "epsilon_dpo/beta": 0.010723333805799484, "epsilon_dpo/beta_margin_grad_mean": -0.3167564570903778, "epsilon_dpo/beta_margin_grad_std": 0.177069753408432, "epsilon_dpo/beta_margin_mean": 0.8901485800743103, "epsilon_dpo/beta_margin_std": 0.8777965307235718, "epsilon_dpo/loss_margin_mean": 83.36041259765625, "grad_norm": 39.762481689453125, "kl/avg_steps": 0.71875, "kl/beta": 0.010799885727465153, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.6201483487445515e-07, "logits/chosen": -3.1258950233459473, "logits/rejected": -2.969557046890259, "logps/chosen": -106.69856262207031, "logps/ref_chosen": -52.60382843017578, "logps/ref_rejected": -59.58381652832031, "logps/rejected": -197.03897094726562, "loss": 0.8468, "rewards/accuracies": 0.875, "rewards/chosen": -0.5821977257728577, "rewards/margins": 0.8901486396789551, "rewards/rejected": -1.472346305847168, "step": 433 }, { "epoch": 0.656084656084656, "epsilon_dpo/beta": 0.010653512552380562, "epsilon_dpo/beta_margin_grad_mean": -0.3407946825027466, "epsilon_dpo/beta_margin_grad_std": 0.17183029651641846, "epsilon_dpo/beta_margin_mean": 0.755030632019043, "epsilon_dpo/beta_margin_std": 0.8586925268173218, "epsilon_dpo/loss_margin_mean": 71.26582336425781, "grad_norm": 41.07792282104492, "kl/avg_steps": 0.65625, "kl/beta": 0.010722815059125423, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.6077844460203204e-07, "logits/chosen": -3.030958652496338, "logits/rejected": -3.1189804077148438, "logps/chosen": -94.27387237548828, "logps/ref_chosen": -42.62062454223633, "logps/ref_rejected": -65.37037658691406, "logps/rejected": -188.28945922851562, "loss": 0.9251, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5521217584609985, "rewards/margins": 0.755030632019043, "rewards/rejected": -1.3071523904800415, "step": 434 }, { "epoch": 0.6575963718820862, "epsilon_dpo/beta": 0.010590712539851665, "epsilon_dpo/beta_margin_grad_mean": -0.33470356464385986, "epsilon_dpo/beta_margin_grad_std": 0.16945385932922363, "epsilon_dpo/beta_margin_mean": 0.7851150631904602, "epsilon_dpo/beta_margin_std": 0.8343574404716492, "epsilon_dpo/loss_margin_mean": 74.57064056396484, "grad_norm": 39.330726623535156, "kl/avg_steps": 0.59375, "kl/beta": 0.010652905330061913, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.5954455004830878e-07, "logits/chosen": -3.1501574516296387, "logits/rejected": -3.031158447265625, "logps/chosen": -110.38848876953125, "logps/ref_chosen": -50.880821228027344, "logps/ref_rejected": -63.97433853149414, "logps/rejected": -198.05264282226562, "loss": 0.8973, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6333208084106445, "rewards/margins": 0.7851150631904602, "rewards/rejected": -1.41843581199646, "step": 435 }, { "epoch": 0.6591080876795162, "epsilon_dpo/beta": 0.010534821078181267, "epsilon_dpo/beta_margin_grad_mean": -0.37446129322052, "epsilon_dpo/beta_margin_grad_std": 0.18517981469631195, "epsilon_dpo/beta_margin_mean": 0.6095327734947205, "epsilon_dpo/beta_margin_std": 0.9018145203590393, "epsilon_dpo/loss_margin_mean": 58.36293411254883, "grad_norm": 38.41319274902344, "kl/avg_steps": 0.53125, "kl/beta": 0.010590027086436749, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.5831318572796847e-07, "logits/chosen": -3.1508169174194336, "logits/rejected": -3.0761466026306152, "logps/chosen": -115.63070678710938, "logps/ref_chosen": -55.031593322753906, "logps/ref_rejected": -61.85481643676758, "logps/rejected": -180.81686401367188, "loss": 1.0423, "rewards/accuracies": 0.734375, "rewards/chosen": -0.640838623046875, "rewards/margins": 0.6095327138900757, "rewards/rejected": -1.2503714561462402, "step": 436 }, { "epoch": 0.6606198034769464, "epsilon_dpo/beta": 0.010492319241166115, "epsilon_dpo/beta_margin_grad_mean": -0.36130034923553467, "epsilon_dpo/beta_margin_grad_std": 0.1940818727016449, "epsilon_dpo/beta_margin_mean": 0.7106533646583557, "epsilon_dpo/beta_margin_std": 1.0074844360351562, "epsilon_dpo/loss_margin_mean": 68.34970092773438, "grad_norm": 48.643959045410156, "kl/avg_steps": 0.40625, "kl/beta": 0.010534064844250679, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.5708438608491815e-07, "logits/chosen": -3.0891101360321045, "logits/rejected": -3.2071430683135986, "logps/chosen": -112.21627807617188, "logps/ref_chosen": -50.010414123535156, "logps/ref_rejected": -82.80753326416016, "logps/rejected": -213.36309814453125, "loss": 1.0056, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6551386117935181, "rewards/margins": 0.7106533646583557, "rewards/rejected": -1.365791916847229, "step": 437 }, { "epoch": 0.6621315192743764, "epsilon_dpo/beta": 0.010415421798825264, "epsilon_dpo/beta_margin_grad_mean": -0.33799031376838684, "epsilon_dpo/beta_margin_grad_std": 0.15876249969005585, "epsilon_dpo/beta_margin_mean": 0.7575523853302002, "epsilon_dpo/beta_margin_std": 0.7641726136207581, "epsilon_dpo/loss_margin_mean": 73.02360534667969, "grad_norm": 35.575584411621094, "kl/avg_steps": 0.734375, "kl/beta": 0.010491443797945976, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.558581854913253e-07, "logits/chosen": -3.03183650970459, "logits/rejected": -2.985513687133789, "logps/chosen": -100.65507507324219, "logps/ref_chosen": -40.676055908203125, "logps/ref_rejected": -59.5482292175293, "logps/rejected": -192.55084228515625, "loss": 0.8934, "rewards/accuracies": 0.8125, "rewards/chosen": -0.62664794921875, "rewards/margins": 0.7575523853302002, "rewards/rejected": -1.3842003345489502, "step": 438 }, { "epoch": 0.6636432350718064, "epsilon_dpo/beta": 0.010352510958909988, "epsilon_dpo/beta_margin_grad_mean": -0.3459526002407074, "epsilon_dpo/beta_margin_grad_std": 0.17511485517024994, "epsilon_dpo/beta_margin_mean": 0.7447972893714905, "epsilon_dpo/beta_margin_std": 0.8790538311004639, "epsilon_dpo/loss_margin_mean": 72.38047790527344, "grad_norm": 34.624935150146484, "kl/avg_steps": 0.609375, "kl/beta": 0.01041495893150568, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.5463461824665658e-07, "logits/chosen": -3.296609401702881, "logits/rejected": -3.246314287185669, "logps/chosen": -135.18341064453125, "logps/ref_chosen": -70.19447326660156, "logps/ref_rejected": -85.50820922851562, "logps/rejected": -222.87762451171875, "loss": 0.9373, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6745564937591553, "rewards/margins": 0.7447972893714905, "rewards/rejected": -1.419353723526001, "step": 439 }, { "epoch": 0.6651549508692366, "epsilon_dpo/beta": 0.01028173603117466, "epsilon_dpo/beta_margin_grad_mean": -0.33036115765571594, "epsilon_dpo/beta_margin_grad_std": 0.16681256890296936, "epsilon_dpo/beta_margin_mean": 0.8153419494628906, "epsilon_dpo/beta_margin_std": 0.8472408652305603, "epsilon_dpo/loss_margin_mean": 79.68769836425781, "grad_norm": 41.26952362060547, "kl/avg_steps": 0.6875, "kl/beta": 0.010351876728236675, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.534137185767178e-07, "logits/chosen": -3.0310044288635254, "logits/rejected": -3.049333095550537, "logps/chosen": -89.48184967041016, "logps/ref_chosen": -39.25298309326172, "logps/ref_rejected": -65.68348693847656, "logps/rejected": -195.6000518798828, "loss": 0.8776, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5178225040435791, "rewards/margins": 0.8153419494628906, "rewards/rejected": -1.3331644535064697, "step": 440 }, { "epoch": 0.6666666666666666, "epsilon_dpo/beta": 0.010208318941295147, "epsilon_dpo/beta_margin_grad_mean": -0.3244849145412445, "epsilon_dpo/beta_margin_grad_std": 0.1406620442867279, "epsilon_dpo/beta_margin_mean": 0.8051685094833374, "epsilon_dpo/beta_margin_std": 0.6723539233207703, "epsilon_dpo/loss_margin_mean": 79.18258666992188, "grad_norm": 35.11744689941406, "kl/avg_steps": 0.71875, "kl/beta": 0.010281194001436234, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.521955206326976e-07, "logits/chosen": -3.144274950027466, "logits/rejected": -3.1143977642059326, "logps/chosen": -96.97933959960938, "logps/ref_chosen": -50.41090393066406, "logps/ref_rejected": -70.79930877685547, "logps/rejected": -196.5503387451172, "loss": 0.8349, "rewards/accuracies": 0.859375, "rewards/chosen": -0.47575563192367554, "rewards/margins": 0.8051685094833374, "rewards/rejected": -1.2809240818023682, "step": 441 }, { "epoch": 0.6681783824640968, "epsilon_dpo/beta": 0.010141851380467415, "epsilon_dpo/beta_margin_grad_mean": -0.34670013189315796, "epsilon_dpo/beta_margin_grad_std": 0.15835554897785187, "epsilon_dpo/beta_margin_mean": 0.7083684206008911, "epsilon_dpo/beta_margin_std": 0.7583370804786682, "epsilon_dpo/loss_margin_mean": 70.21942138671875, "grad_norm": 42.561744689941406, "kl/avg_steps": 0.65625, "kl/beta": 0.0102078253403306, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -3.115534782409668, "logits/rejected": -3.1137397289276123, "logps/chosen": -123.26164245605469, "logps/ref_chosen": -54.4185676574707, "logps/ref_rejected": -75.1820297241211, "logps/rejected": -214.24453735351562, "loss": 0.9247, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6998666524887085, "rewards/margins": 0.7083684206008911, "rewards/rejected": -1.4082350730895996, "step": 442 }, { "epoch": 0.6696900982615268, "epsilon_dpo/beta": 0.010072559118270874, "epsilon_dpo/beta_margin_grad_mean": -0.34837085008621216, "epsilon_dpo/beta_margin_grad_std": 0.1784690022468567, "epsilon_dpo/beta_margin_mean": 0.7196089625358582, "epsilon_dpo/beta_margin_std": 0.8734248280525208, "epsilon_dpo/loss_margin_mean": 71.84913635253906, "grad_norm": 37.01234817504883, "kl/avg_steps": 0.6875, "kl/beta": 0.010141273029148579, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.4976736614834662e-07, "logits/chosen": -2.98148512840271, "logits/rejected": -3.064034938812256, "logps/chosen": -108.21389770507812, "logps/ref_chosen": -50.46333312988281, "logps/ref_rejected": -67.96987915039062, "logps/rejected": -197.569580078125, "loss": 0.9562, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5838524699211121, "rewards/margins": 0.7196090221405029, "rewards/rejected": -1.3034615516662598, "step": 443 }, { "epoch": 0.671201814058957, "epsilon_dpo/beta": 0.010032112710177898, "epsilon_dpo/beta_margin_grad_mean": -0.4191220700740814, "epsilon_dpo/beta_margin_grad_std": 0.1901054084300995, "epsilon_dpo/beta_margin_mean": 0.40702149271965027, "epsilon_dpo/beta_margin_std": 0.9732410311698914, "epsilon_dpo/loss_margin_mean": 41.196876525878906, "grad_norm": 47.24799728393555, "kl/avg_steps": 0.40625, "kl/beta": 0.010072027333080769, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.4855747752871654e-07, "logits/chosen": -3.057222843170166, "logits/rejected": -3.1693315505981445, "logps/chosen": -130.43536376953125, "logps/ref_chosen": -51.869789123535156, "logps/ref_rejected": -74.45449829101562, "logps/rejected": -194.21694946289062, "loss": 1.2195, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7900919914245605, "rewards/margins": 0.40702149271965027, "rewards/rejected": -1.1971135139465332, "step": 444 }, { "epoch": 0.672713529856387, "epsilon_dpo/beta": 0.00997898168861866, "epsilon_dpo/beta_margin_grad_mean": -0.3583669364452362, "epsilon_dpo/beta_margin_grad_std": 0.16688847541809082, "epsilon_dpo/beta_margin_mean": 0.6637567281723022, "epsilon_dpo/beta_margin_std": 0.7902541160583496, "epsilon_dpo/loss_margin_mean": 66.99334716796875, "grad_norm": 39.09977722167969, "kl/avg_steps": 0.53125, "kl/beta": 0.010031275451183319, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.473504264745062e-07, "logits/chosen": -2.9483582973480225, "logits/rejected": -2.896817207336426, "logps/chosen": -121.84056091308594, "logps/ref_chosen": -51.32820129394531, "logps/ref_rejected": -61.57338333129883, "logps/rejected": -199.0791015625, "loss": 0.9666, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7060898542404175, "rewards/margins": 0.6637567281723022, "rewards/rejected": -1.3698465824127197, "step": 445 }, { "epoch": 0.674225245653817, "epsilon_dpo/beta": 0.009904420003294945, "epsilon_dpo/beta_margin_grad_mean": -0.3164527416229248, "epsilon_dpo/beta_margin_grad_std": 0.15325742959976196, "epsilon_dpo/beta_margin_mean": 0.877436101436615, "epsilon_dpo/beta_margin_std": 0.7844016551971436, "epsilon_dpo/loss_margin_mean": 88.88938903808594, "grad_norm": 35.128807067871094, "kl/avg_steps": 0.75, "kl/beta": 0.009978266432881355, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.461462467495284e-07, "logits/chosen": -3.033296585083008, "logits/rejected": -3.012072801589966, "logps/chosen": -96.05516815185547, "logps/ref_chosen": -44.46532440185547, "logps/ref_rejected": -64.81745910644531, "logps/rejected": -205.29669189453125, "loss": 0.8178, "rewards/accuracies": 0.875, "rewards/chosen": -0.5122168064117432, "rewards/margins": 0.877436101436615, "rewards/rejected": -1.389652967453003, "step": 446 }, { "epoch": 0.6757369614512472, "epsilon_dpo/beta": 0.009833784773945808, "epsilon_dpo/beta_margin_grad_mean": -0.33009472489356995, "epsilon_dpo/beta_margin_grad_std": 0.1772153377532959, "epsilon_dpo/beta_margin_mean": 0.8100787997245789, "epsilon_dpo/beta_margin_std": 0.8676522970199585, "epsilon_dpo/loss_margin_mean": 82.79435729980469, "grad_norm": 40.57292175292969, "kl/avg_steps": 0.71875, "kl/beta": 0.009903986006975174, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4494497203727843e-07, "logits/chosen": -3.022735118865967, "logits/rejected": -3.212925910949707, "logps/chosen": -101.14094543457031, "logps/ref_chosen": -44.02503204345703, "logps/ref_rejected": -81.97323608398438, "logps/rejected": -221.88351440429688, "loss": 0.8943, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5635664463043213, "rewards/margins": 0.8100787997245789, "rewards/rejected": -1.373645305633545, "step": 447 }, { "epoch": 0.6772486772486772, "epsilon_dpo/beta": 0.009778975509107113, "epsilon_dpo/beta_margin_grad_mean": -0.3702988028526306, "epsilon_dpo/beta_margin_grad_std": 0.16913077235221863, "epsilon_dpo/beta_margin_mean": 0.6145232915878296, "epsilon_dpo/beta_margin_std": 0.8328132033348083, "epsilon_dpo/loss_margin_mean": 63.34140396118164, "grad_norm": 48.729393005371094, "kl/avg_steps": 0.5625, "kl/beta": 0.009833309799432755, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.4374663593999256e-07, "logits/chosen": -3.17475962638855, "logits/rejected": -3.1191859245300293, "logps/chosen": -114.37477111816406, "logps/ref_chosen": -50.81896209716797, "logps/ref_rejected": -73.50254821777344, "logps/rejected": -200.39974975585938, "loss": 1.0116, "rewards/accuracies": 0.765625, "rewards/chosen": -0.622562050819397, "rewards/margins": 0.6145232915878296, "rewards/rejected": -1.2370853424072266, "step": 448 }, { "epoch": 0.6787603930461074, "epsilon_dpo/beta": 0.009736500680446625, "epsilon_dpo/beta_margin_grad_mean": -0.3968140482902527, "epsilon_dpo/beta_margin_grad_std": 0.16042354702949524, "epsilon_dpo/beta_margin_mean": 0.47273388504981995, "epsilon_dpo/beta_margin_std": 0.7330147624015808, "epsilon_dpo/loss_margin_mean": 49.04754638671875, "grad_norm": 46.11749267578125, "kl/avg_steps": 0.4375, "kl/beta": 0.009778306819498539, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.4255127197770707e-07, "logits/chosen": -3.2978219985961914, "logits/rejected": -3.069335460662842, "logps/chosen": -147.80227661132812, "logps/ref_chosen": -67.94647216796875, "logps/ref_rejected": -75.97640991210938, "logps/rejected": -204.8797607421875, "loss": 1.0906, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7783463597297668, "rewards/margins": 0.47273391485214233, "rewards/rejected": -1.2510802745819092, "step": 449 }, { "epoch": 0.6802721088435374, "epsilon_dpo/beta": 0.00970472302287817, "epsilon_dpo/beta_margin_grad_mean": -0.3903656601905823, "epsilon_dpo/beta_margin_grad_std": 0.1692904680967331, "epsilon_dpo/beta_margin_mean": 0.5265427827835083, "epsilon_dpo/beta_margin_std": 0.819326639175415, "epsilon_dpo/loss_margin_mean": 54.84526062011719, "grad_norm": 48.26889419555664, "kl/avg_steps": 0.328125, "kl/beta": 0.0097357127815485, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.4135891358732205e-07, "logits/chosen": -2.9436933994293213, "logits/rejected": -3.1964752674102783, "logps/chosen": -104.97734069824219, "logps/ref_chosen": -41.225425720214844, "logps/ref_rejected": -73.09139251708984, "logps/rejected": -191.68856811523438, "loss": 1.0738, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6211973428726196, "rewards/margins": 0.5265427827835083, "rewards/rejected": -1.147740125656128, "step": 450 }, { "epoch": 0.6817838246409675, "epsilon_dpo/beta": 0.009647219441831112, "epsilon_dpo/beta_margin_grad_mean": -0.3605809211730957, "epsilon_dpo/beta_margin_grad_std": 0.15067480504512787, "epsilon_dpo/beta_margin_mean": 0.6386559009552002, "epsilon_dpo/beta_margin_std": 0.7163569927215576, "epsilon_dpo/loss_margin_mean": 66.59651947021484, "grad_norm": 38.20392990112305, "kl/avg_steps": 0.59375, "kl/beta": 0.009703871794044971, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.4016959412166437e-07, "logits/chosen": -3.1152894496917725, "logits/rejected": -3.137643814086914, "logps/chosen": -108.02947235107422, "logps/ref_chosen": -51.84246063232422, "logps/ref_rejected": -68.494873046875, "logps/rejected": -191.27841186523438, "loss": 0.9593, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5438657999038696, "rewards/margins": 0.6386559009552002, "rewards/rejected": -1.1825218200683594, "step": 451 }, { "epoch": 0.6832955404383976, "epsilon_dpo/beta": 0.009584248065948486, "epsilon_dpo/beta_margin_grad_mean": -0.354620099067688, "epsilon_dpo/beta_margin_grad_std": 0.1553487926721573, "epsilon_dpo/beta_margin_mean": 0.687550961971283, "epsilon_dpo/beta_margin_std": 0.7707722783088684, "epsilon_dpo/loss_margin_mean": 72.09152221679688, "grad_norm": 35.652626037597656, "kl/avg_steps": 0.65625, "kl/beta": 0.00964659545570612, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.3898334684855645e-07, "logits/chosen": -3.0505571365356445, "logits/rejected": -2.9589128494262695, "logps/chosen": -113.3621826171875, "logps/ref_chosen": -47.722564697265625, "logps/ref_rejected": -69.6410140991211, "logps/rejected": -207.37216186523438, "loss": 0.9385, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6300846338272095, "rewards/margins": 0.6875509023666382, "rewards/rejected": -1.3176355361938477, "step": 452 }, { "epoch": 0.6848072562358276, "epsilon_dpo/beta": 0.009533742442727089, "epsilon_dpo/beta_margin_grad_mean": -0.34590622782707214, "epsilon_dpo/beta_margin_grad_std": 0.16824328899383545, "epsilon_dpo/beta_margin_mean": 0.7319109439849854, "epsilon_dpo/beta_margin_std": 0.8173980116844177, "epsilon_dpo/loss_margin_mean": 77.29075622558594, "grad_norm": 33.68185043334961, "kl/avg_steps": 0.53125, "kl/beta": 0.009583702310919762, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3780020494988445e-07, "logits/chosen": -3.0558266639709473, "logits/rejected": -3.0066354274749756, "logps/chosen": -108.08393859863281, "logps/ref_chosen": -51.57438659667969, "logps/ref_rejected": -70.69873046875, "logps/rejected": -204.49903869628906, "loss": 0.927, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5403603315353394, "rewards/margins": 0.7319109439849854, "rewards/rejected": -1.2722712755203247, "step": 453 }, { "epoch": 0.6863189720332578, "epsilon_dpo/beta": 0.009484836831688881, "epsilon_dpo/beta_margin_grad_mean": -0.360315203666687, "epsilon_dpo/beta_margin_grad_std": 0.16899505257606506, "epsilon_dpo/beta_margin_mean": 0.6487451791763306, "epsilon_dpo/beta_margin_std": 0.8249931931495667, "epsilon_dpo/loss_margin_mean": 68.93878173828125, "grad_norm": 40.3460807800293, "kl/avg_steps": 0.515625, "kl/beta": 0.009533057920634747, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.366202015206706e-07, "logits/chosen": -3.006380558013916, "logits/rejected": -2.9053120613098145, "logps/chosen": -106.21598815917969, "logps/ref_chosen": -49.735389709472656, "logps/ref_rejected": -56.315826416015625, "logps/rejected": -181.73519897460938, "loss": 0.9874, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5377349257469177, "rewards/margins": 0.6487451791763306, "rewards/rejected": -1.1864800453186035, "step": 454 }, { "epoch": 0.6878306878306878, "epsilon_dpo/beta": 0.00943618081510067, "epsilon_dpo/beta_margin_grad_mean": -0.35162416100502014, "epsilon_dpo/beta_margin_grad_std": 0.16402143239974976, "epsilon_dpo/beta_margin_mean": 0.7024453282356262, "epsilon_dpo/beta_margin_std": 0.7901948094367981, "epsilon_dpo/loss_margin_mean": 74.93250274658203, "grad_norm": 44.44548034667969, "kl/avg_steps": 0.515625, "kl/beta": 0.009484155103564262, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.354433695681474e-07, "logits/chosen": -3.250429630279541, "logits/rejected": -3.0474910736083984, "logps/chosen": -125.97431945800781, "logps/ref_chosen": -63.15057373046875, "logps/ref_rejected": -67.60485076904297, "logps/rejected": -205.36109924316406, "loss": 0.938, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5954431295394897, "rewards/margins": 0.7024452686309814, "rewards/rejected": -1.2978883981704712, "step": 455 }, { "epoch": 0.6893424036281179, "epsilon_dpo/beta": 0.009380416944622993, "epsilon_dpo/beta_margin_grad_mean": -0.36414530873298645, "epsilon_dpo/beta_margin_grad_std": 0.17582763731479645, "epsilon_dpo/beta_margin_mean": 0.6404057145118713, "epsilon_dpo/beta_margin_std": 0.8544778227806091, "epsilon_dpo/loss_margin_mean": 68.75310516357422, "grad_norm": 32.23760986328125, "kl/avg_steps": 0.59375, "kl/beta": 0.00943550281226635, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3426974201083439e-07, "logits/chosen": -3.021559476852417, "logits/rejected": -3.0597939491271973, "logps/chosen": -110.34629821777344, "logps/ref_chosen": -49.2303466796875, "logps/ref_rejected": -70.36286926269531, "logps/rejected": -200.23193359375, "loss": 1.0033, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5756324529647827, "rewards/margins": 0.6404057741165161, "rewards/rejected": -1.2160382270812988, "step": 456 }, { "epoch": 0.690854119425548, "epsilon_dpo/beta": 0.009330913424491882, "epsilon_dpo/beta_margin_grad_mean": -0.37853631377220154, "epsilon_dpo/beta_margin_grad_std": 0.15840989351272583, "epsilon_dpo/beta_margin_mean": 0.5547515749931335, "epsilon_dpo/beta_margin_std": 0.7463831901550293, "epsilon_dpo/loss_margin_mean": 59.925315856933594, "grad_norm": 34.59005355834961, "kl/avg_steps": 0.53125, "kl/beta": 0.009379810653626919, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3309935167761717e-07, "logits/chosen": -2.8486409187316895, "logits/rejected": -3.09061336517334, "logps/chosen": -113.46354675292969, "logps/ref_chosen": -41.363037109375, "logps/ref_rejected": -66.77450561523438, "logps/rejected": -198.8003387451172, "loss": 1.0308, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6741635799407959, "rewards/margins": 0.5547515749931335, "rewards/rejected": -1.2289150953292847, "step": 457 }, { "epoch": 0.6923658352229781, "epsilon_dpo/beta": 0.009275772608816624, "epsilon_dpo/beta_margin_grad_mean": -0.346624493598938, "epsilon_dpo/beta_margin_grad_std": 0.1617269068956375, "epsilon_dpo/beta_margin_mean": 0.7118436098098755, "epsilon_dpo/beta_margin_std": 0.7810143232345581, "epsilon_dpo/loss_margin_mean": 77.20606994628906, "grad_norm": 40.32435607910156, "kl/avg_steps": 0.59375, "kl/beta": 0.009330243803560734, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3193223130682936e-07, "logits/chosen": -2.9063267707824707, "logits/rejected": -3.1843972206115723, "logps/chosen": -99.68240356445312, "logps/ref_chosen": -42.73438262939453, "logps/ref_rejected": -79.93819427490234, "logps/rejected": -214.09228515625, "loss": 0.9297, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5296976566314697, "rewards/margins": 0.7118436098098755, "rewards/rejected": -1.2415411472320557, "step": 458 }, { "epoch": 0.6938775510204082, "epsilon_dpo/beta": 0.009215225465595722, "epsilon_dpo/beta_margin_grad_mean": -0.33904725313186646, "epsilon_dpo/beta_margin_grad_std": 0.1560046672821045, "epsilon_dpo/beta_margin_mean": 0.752241313457489, "epsilon_dpo/beta_margin_std": 0.7647945284843445, "epsilon_dpo/loss_margin_mean": 82.0270004272461, "grad_norm": 35.54068374633789, "kl/avg_steps": 0.65625, "kl/beta": 0.009275171905755997, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.3076841354533658e-07, "logits/chosen": -3.1813931465148926, "logits/rejected": -2.987440586090088, "logps/chosen": -109.76310729980469, "logps/ref_chosen": -57.00560760498047, "logps/ref_rejected": -78.57566833496094, "logps/rejected": -213.36016845703125, "loss": 0.8952, "rewards/accuracies": 0.859375, "rewards/chosen": -0.48780763149261475, "rewards/margins": 0.752241313457489, "rewards/rejected": -1.2400490045547485, "step": 459 }, { "epoch": 0.6953892668178382, "epsilon_dpo/beta": 0.009172424674034119, "epsilon_dpo/beta_margin_grad_mean": -0.35334837436676025, "epsilon_dpo/beta_margin_grad_std": 0.15726487338542938, "epsilon_dpo/beta_margin_mean": 0.6859422326087952, "epsilon_dpo/beta_margin_std": 0.7452183365821838, "epsilon_dpo/loss_margin_mean": 75.31069946289062, "grad_norm": 41.105247497558594, "kl/avg_steps": 0.46875, "kl/beta": 0.009214701130986214, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.2960793094762345e-07, "logits/chosen": -3.0217089653015137, "logits/rejected": -3.061248779296875, "logps/chosen": -117.30683898925781, "logps/ref_chosen": -54.62413024902344, "logps/ref_rejected": -84.09452056884766, "logps/rejected": -222.08792114257812, "loss": 0.9357, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5775173902511597, "rewards/margins": 0.6859422922134399, "rewards/rejected": -1.2634596824645996, "step": 460 }, { "epoch": 0.6969009826152683, "epsilon_dpo/beta": 0.00910383090376854, "epsilon_dpo/beta_margin_grad_mean": -0.3366430997848511, "epsilon_dpo/beta_margin_grad_std": 0.1474873423576355, "epsilon_dpo/beta_margin_mean": 0.779132068157196, "epsilon_dpo/beta_margin_std": 0.7680892944335938, "epsilon_dpo/loss_margin_mean": 85.84680938720703, "grad_norm": 34.63417053222656, "kl/avg_steps": 0.75, "kl/beta": 0.00917170848697424, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.2845081597488286e-07, "logits/chosen": -2.9778292179107666, "logits/rejected": -3.0602681636810303, "logps/chosen": -89.60379028320312, "logps/ref_chosen": -42.29850769042969, "logps/ref_rejected": -66.46756744384766, "logps/rejected": -199.61965942382812, "loss": 0.8723, "rewards/accuracies": 0.84375, "rewards/chosen": -0.43167203664779663, "rewards/margins": 0.779132068157196, "rewards/rejected": -1.2108041048049927, "step": 461 }, { "epoch": 0.6984126984126984, "epsilon_dpo/beta": 0.009050285443663597, "epsilon_dpo/beta_margin_grad_mean": -0.3437485694885254, "epsilon_dpo/beta_margin_grad_std": 0.15861545503139496, "epsilon_dpo/beta_margin_mean": 0.7234411239624023, "epsilon_dpo/beta_margin_std": 0.7432688474655151, "epsilon_dpo/loss_margin_mean": 80.41810607910156, "grad_norm": 37.28372573852539, "kl/avg_steps": 0.59375, "kl/beta": 0.009103432297706604, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.27297100994108e-07, "logits/chosen": -3.003964900970459, "logits/rejected": -2.9240102767944336, "logps/chosen": -114.0467758178711, "logps/ref_chosen": -50.31602478027344, "logps/ref_rejected": -68.26872253417969, "logps/rejected": -212.41757202148438, "loss": 0.911, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5790793895721436, "rewards/margins": 0.7234411239624023, "rewards/rejected": -1.302520513534546, "step": 462 }, { "epoch": 0.6999244142101285, "epsilon_dpo/beta": 0.009002524428069592, "epsilon_dpo/beta_margin_grad_mean": -0.3657132089138031, "epsilon_dpo/beta_margin_grad_std": 0.15791313350200653, "epsilon_dpo/beta_margin_mean": 0.6076966524124146, "epsilon_dpo/beta_margin_std": 0.7330036163330078, "epsilon_dpo/loss_margin_mean": 68.00065612792969, "grad_norm": 44.75859451293945, "kl/avg_steps": 0.53125, "kl/beta": 0.009049700573086739, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.2614681827718695e-07, "logits/chosen": -3.1410956382751465, "logits/rejected": -2.8460946083068848, "logps/chosen": -129.70838928222656, "logps/ref_chosen": -57.89822769165039, "logps/ref_rejected": -60.88771057128906, "logps/rejected": -200.69851684570312, "loss": 0.9894, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6480365991592407, "rewards/margins": 0.6076966524124146, "rewards/rejected": -1.2557332515716553, "step": 463 }, { "epoch": 0.7014361300075586, "epsilon_dpo/beta": 0.008946510963141918, "epsilon_dpo/beta_margin_grad_mean": -0.33674463629722595, "epsilon_dpo/beta_margin_grad_std": 0.16115881502628326, "epsilon_dpo/beta_margin_mean": 0.7785069346427917, "epsilon_dpo/beta_margin_std": 0.7943654656410217, "epsilon_dpo/loss_margin_mean": 87.45020294189453, "grad_norm": 35.144004821777344, "kl/avg_steps": 0.625, "kl/beta": 0.009001878090202808, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -2.9778032302856445, "logits/rejected": -2.8806443214416504, "logps/chosen": -113.39933776855469, "logps/ref_chosen": -49.217796325683594, "logps/ref_rejected": -60.73811340332031, "logps/rejected": -212.369873046875, "loss": 0.8868, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5752967596054077, "rewards/margins": 0.7785069942474365, "rewards/rejected": -1.3538036346435547, "step": 464 }, { "epoch": 0.7029478458049887, "epsilon_dpo/beta": 0.008893738500773907, "epsilon_dpo/beta_margin_grad_mean": -0.3740887939929962, "epsilon_dpo/beta_margin_grad_std": 0.15973451733589172, "epsilon_dpo/beta_margin_mean": 0.5860607028007507, "epsilon_dpo/beta_margin_std": 0.7525936961174011, "epsilon_dpo/loss_margin_mean": 66.32755279541016, "grad_norm": 43.63521194458008, "kl/avg_steps": 0.59375, "kl/beta": 0.008945965208113194, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.238566782415197e-07, "logits/chosen": -3.061175584793091, "logits/rejected": -3.111180305480957, "logps/chosen": -122.44712829589844, "logps/ref_chosen": -49.716182708740234, "logps/ref_rejected": -79.85928344726562, "logps/rejected": -218.91778564453125, "loss": 1.0093, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6484643220901489, "rewards/margins": 0.5860607028007507, "rewards/rejected": -1.2345249652862549, "step": 465 }, { "epoch": 0.7044595616024187, "epsilon_dpo/beta": 0.008857919834554195, "epsilon_dpo/beta_margin_grad_mean": -0.4078911542892456, "epsilon_dpo/beta_margin_grad_std": 0.16191598773002625, "epsilon_dpo/beta_margin_mean": 0.4127967357635498, "epsilon_dpo/beta_margin_std": 0.7428742051124573, "epsilon_dpo/loss_margin_mean": 47.15573501586914, "grad_norm": 48.011474609375, "kl/avg_steps": 0.40625, "kl/beta": 0.00889316201210022, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.2271688498291334e-07, "logits/chosen": -3.014935255050659, "logits/rejected": -2.9322071075439453, "logps/chosen": -140.85598754882812, "logps/ref_chosen": -62.54936218261719, "logps/ref_rejected": -63.35521697998047, "logps/rejected": -188.81756591796875, "loss": 1.142, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6965527534484863, "rewards/margins": 0.4127967357635498, "rewards/rejected": -1.1093494892120361, "step": 466 }, { "epoch": 0.7059712773998488, "epsilon_dpo/beta": 0.008799934759736061, "epsilon_dpo/beta_margin_grad_mean": -0.3562059700489044, "epsilon_dpo/beta_margin_grad_std": 0.16323348879814148, "epsilon_dpo/beta_margin_mean": 0.6768296360969543, "epsilon_dpo/beta_margin_std": 0.7987775802612305, "epsilon_dpo/loss_margin_mean": 77.37510681152344, "grad_norm": 40.956634521484375, "kl/avg_steps": 0.65625, "kl/beta": 0.00885718036442995, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.2158065210664848e-07, "logits/chosen": -2.9398422241210938, "logits/rejected": -2.977806568145752, "logps/chosen": -113.66433715820312, "logps/ref_chosen": -45.69499206542969, "logps/ref_rejected": -67.98948669433594, "logps/rejected": -213.33395385742188, "loss": 0.9562, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6000622510910034, "rewards/margins": 0.6768296360969543, "rewards/rejected": -1.2768919467926025, "step": 467 }, { "epoch": 0.7074829931972789, "epsilon_dpo/beta": 0.00873706117272377, "epsilon_dpo/beta_margin_grad_mean": -0.3294633626937866, "epsilon_dpo/beta_margin_grad_std": 0.16503098607063293, "epsilon_dpo/beta_margin_mean": 0.7967308759689331, "epsilon_dpo/beta_margin_std": 0.7904424071311951, "epsilon_dpo/loss_margin_mean": 91.6434555053711, "grad_norm": 38.528236389160156, "kl/avg_steps": 0.71875, "kl/beta": 0.008799433708190918, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.204480113956011e-07, "logits/chosen": -3.1074306964874268, "logits/rejected": -2.987607479095459, "logps/chosen": -112.33391571044922, "logps/ref_chosen": -53.87787628173828, "logps/ref_rejected": -72.89202880859375, "logps/rejected": -222.99151611328125, "loss": 0.8774, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5118928551673889, "rewards/margins": 0.7967308759689331, "rewards/rejected": -1.3086237907409668, "step": 468 }, { "epoch": 0.708994708994709, "epsilon_dpo/beta": 0.008688364177942276, "epsilon_dpo/beta_margin_grad_mean": -0.3695855438709259, "epsilon_dpo/beta_margin_grad_std": 0.13571633398532867, "epsilon_dpo/beta_margin_mean": 0.6020424962043762, "epsilon_dpo/beta_margin_std": 0.6872910857200623, "epsilon_dpo/loss_margin_mean": 69.64955139160156, "grad_norm": 43.27452087402344, "kl/avg_steps": 0.5625, "kl/beta": 0.00873663928359747, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1931899453216697e-07, "logits/chosen": -3.1579771041870117, "logits/rejected": -3.120800018310547, "logps/chosen": -119.15943908691406, "logps/ref_chosen": -53.718421936035156, "logps/ref_rejected": -74.461669921875, "logps/rejected": -209.55224609375, "loss": 0.9706, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5706468820571899, "rewards/margins": 0.6020424962043762, "rewards/rejected": -1.1726893186569214, "step": 469 }, { "epoch": 0.7105064247921391, "epsilon_dpo/beta": 0.008634334430098534, "epsilon_dpo/beta_margin_grad_mean": -0.35456258058547974, "epsilon_dpo/beta_margin_grad_std": 0.14077353477478027, "epsilon_dpo/beta_margin_mean": 0.6569988131523132, "epsilon_dpo/beta_margin_std": 0.6583942770957947, "epsilon_dpo/loss_margin_mean": 76.4986572265625, "grad_norm": 47.810977935791016, "kl/avg_steps": 0.625, "kl/beta": 0.008687769994139671, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1819363309737438e-07, "logits/chosen": -2.899038791656494, "logits/rejected": -2.8511667251586914, "logps/chosen": -101.81800842285156, "logps/ref_chosen": -44.10430145263672, "logps/ref_rejected": -56.753562927246094, "logps/rejected": -190.96592712402344, "loss": 0.9302, "rewards/accuracies": 0.84375, "rewards/chosen": -0.49989408254623413, "rewards/margins": 0.6569988131523132, "rewards/rejected": -1.1568928956985474, "step": 470 }, { "epoch": 0.7120181405895691, "epsilon_dpo/beta": 0.00858070608228445, "epsilon_dpo/beta_margin_grad_mean": -0.34139055013656616, "epsilon_dpo/beta_margin_grad_std": 0.14833906292915344, "epsilon_dpo/beta_margin_mean": 0.7167556881904602, "epsilon_dpo/beta_margin_std": 0.6927893757820129, "epsilon_dpo/loss_margin_mean": 83.94532775878906, "grad_norm": 38.46868896484375, "kl/avg_steps": 0.625, "kl/beta": 0.00863380916416645, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1707195857000215e-07, "logits/chosen": -3.0611729621887207, "logits/rejected": -2.950870990753174, "logps/chosen": -110.12937927246094, "logps/ref_chosen": -52.552894592285156, "logps/ref_rejected": -64.72122192382812, "logps/rejected": -206.24302673339844, "loss": 0.9007, "rewards/accuracies": 0.84375, "rewards/chosen": -0.49612343311309814, "rewards/margins": 0.7167556881904602, "rewards/rejected": -1.2128791809082031, "step": 471 }, { "epoch": 0.7135298563869993, "epsilon_dpo/beta": 0.008532771840691566, "epsilon_dpo/beta_margin_grad_mean": -0.36492446064949036, "epsilon_dpo/beta_margin_grad_std": 0.16223636269569397, "epsilon_dpo/beta_margin_mean": 0.6312053799629211, "epsilon_dpo/beta_margin_std": 0.7762150168418884, "epsilon_dpo/loss_margin_mean": 74.49177551269531, "grad_norm": 47.13676452636719, "kl/avg_steps": 0.5625, "kl/beta": 0.008580182678997517, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1595400232569768e-07, "logits/chosen": -3.12437105178833, "logits/rejected": -2.991716146469116, "logps/chosen": -117.95704650878906, "logps/ref_chosen": -53.82633972167969, "logps/ref_rejected": -66.27600860595703, "logps/rejected": -204.89849853515625, "loss": 0.9834, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5482678413391113, "rewards/margins": 0.6312054395675659, "rewards/rejected": -1.1794732809066772, "step": 472 }, { "epoch": 0.7150415721844293, "epsilon_dpo/beta": 0.008485044352710247, "epsilon_dpo/beta_margin_grad_mean": -0.3758508563041687, "epsilon_dpo/beta_margin_grad_std": 0.17187382280826569, "epsilon_dpo/beta_margin_mean": 0.5881558060646057, "epsilon_dpo/beta_margin_std": 0.8373464941978455, "epsilon_dpo/loss_margin_mean": 69.85887908935547, "grad_norm": 46.701534271240234, "kl/avg_steps": 0.5625, "kl/beta": 0.008532189764082432, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1483979563610069e-07, "logits/chosen": -3.0324227809906006, "logits/rejected": -3.2037577629089355, "logps/chosen": -97.7859115600586, "logps/ref_chosen": -47.981719970703125, "logps/ref_rejected": -87.42070770263672, "logps/rejected": -207.08377075195312, "loss": 1.0347, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4240872263908386, "rewards/margins": 0.5881558656692505, "rewards/rejected": -1.0122430324554443, "step": 473 }, { "epoch": 0.7165532879818595, "epsilon_dpo/beta": 0.008445537649095058, "epsilon_dpo/beta_margin_grad_mean": -0.375211238861084, "epsilon_dpo/beta_margin_grad_std": 0.16416117548942566, "epsilon_dpo/beta_margin_mean": 0.5792803168296814, "epsilon_dpo/beta_margin_std": 0.7684534788131714, "epsilon_dpo/loss_margin_mean": 69.1688003540039, "grad_norm": 45.77193069458008, "kl/avg_steps": 0.46875, "kl/beta": 0.008484464138746262, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1372936966796709e-07, "logits/chosen": -2.903597831726074, "logits/rejected": -3.00131893157959, "logps/chosen": -102.572265625, "logps/ref_chosen": -39.87531280517578, "logps/ref_rejected": -67.63872528076172, "logps/rejected": -199.50448608398438, "loss": 1.0203, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5307076573371887, "rewards/margins": 0.5792803764343262, "rewards/rejected": -1.1099879741668701, "step": 474 }, { "epoch": 0.7180650037792895, "epsilon_dpo/beta": 0.008385020308196545, "epsilon_dpo/beta_margin_grad_mean": -0.3307664096355438, "epsilon_dpo/beta_margin_grad_std": 0.15174920856952667, "epsilon_dpo/beta_margin_mean": 0.8129953145980835, "epsilon_dpo/beta_margin_std": 0.801196813583374, "epsilon_dpo/loss_margin_mean": 97.32677459716797, "grad_norm": 36.2890625, "kl/avg_steps": 0.71875, "kl/beta": 0.008444879204034805, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.126227554822985e-07, "logits/chosen": -3.0218334197998047, "logits/rejected": -2.9268972873687744, "logps/chosen": -116.41016387939453, "logps/ref_chosen": -51.6319465637207, "logps/ref_rejected": -81.1695327758789, "logps/rejected": -243.2745361328125, "loss": 0.8601, "rewards/accuracies": 0.875, "rewards/chosen": -0.5447710752487183, "rewards/margins": 0.8129953145980835, "rewards/rejected": -1.3577663898468018, "step": 475 }, { "epoch": 0.7195767195767195, "epsilon_dpo/beta": 0.008340904489159584, "epsilon_dpo/beta_margin_grad_mean": -0.38177502155303955, "epsilon_dpo/beta_margin_grad_std": 0.14762112498283386, "epsilon_dpo/beta_margin_mean": 0.5338074564933777, "epsilon_dpo/beta_margin_std": 0.6770220398902893, "epsilon_dpo/loss_margin_mean": 64.48006439208984, "grad_norm": 45.318939208984375, "kl/avg_steps": 0.53125, "kl/beta": 0.008384614251554012, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.1151998403347243e-07, "logits/chosen": -3.0409555435180664, "logits/rejected": -3.0340170860290527, "logps/chosen": -132.08428955078125, "logps/ref_chosen": -59.138587951660156, "logps/ref_rejected": -72.81019592285156, "logps/rejected": -210.2359619140625, "loss": 1.0259, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6104044914245605, "rewards/margins": 0.5338074564933777, "rewards/rejected": -1.144212007522583, "step": 476 }, { "epoch": 0.7210884353741497, "epsilon_dpo/beta": 0.008294221013784409, "epsilon_dpo/beta_margin_grad_mean": -0.3728020191192627, "epsilon_dpo/beta_margin_grad_std": 0.17356941103935242, "epsilon_dpo/beta_margin_mean": 0.5870934724807739, "epsilon_dpo/beta_margin_std": 0.8151865005493164, "epsilon_dpo/loss_margin_mean": 71.36532592773438, "grad_norm": 41.995609283447266, "kl/avg_steps": 0.5625, "kl/beta": 0.008340306580066681, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1042108616837692e-07, "logits/chosen": -2.9323155879974365, "logits/rejected": -3.070002555847168, "logps/chosen": -122.77670288085938, "logps/ref_chosen": -51.14232635498047, "logps/ref_rejected": -78.74493408203125, "logps/rejected": -221.74464416503906, "loss": 1.0316, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5965589880943298, "rewards/margins": 0.5870934724807739, "rewards/rejected": -1.183652400970459, "step": 477 }, { "epoch": 0.7226001511715797, "epsilon_dpo/beta": 0.008258195593953133, "epsilon_dpo/beta_margin_grad_mean": -0.383217453956604, "epsilon_dpo/beta_margin_grad_std": 0.169718936085701, "epsilon_dpo/beta_margin_mean": 0.5288316607475281, "epsilon_dpo/beta_margin_std": 0.7749770879745483, "epsilon_dpo/loss_margin_mean": 64.6636734008789, "grad_norm": 38.47211456298828, "kl/avg_steps": 0.4375, "kl/beta": 0.008293654769659042, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0932609262554746e-07, "logits/chosen": -2.846428394317627, "logits/rejected": -2.745283603668213, "logps/chosen": -114.35136413574219, "logps/ref_chosen": -49.11362075805664, "logps/ref_rejected": -46.867347717285156, "logps/rejected": -176.76876831054688, "loss": 1.0632, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5408506393432617, "rewards/margins": 0.5288316607475281, "rewards/rejected": -1.069682240486145, "step": 478 }, { "epoch": 0.7241118669690099, "epsilon_dpo/beta": 0.008211900480091572, "epsilon_dpo/beta_margin_grad_mean": -0.3750561475753784, "epsilon_dpo/beta_margin_grad_std": 0.15112420916557312, "epsilon_dpo/beta_margin_mean": 0.5641729831695557, "epsilon_dpo/beta_margin_std": 0.702041506767273, "epsilon_dpo/loss_margin_mean": 69.18150329589844, "grad_norm": 41.25717544555664, "kl/avg_steps": 0.5625, "kl/beta": 0.008257527835667133, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0823503403430734e-07, "logits/chosen": -2.9416332244873047, "logits/rejected": -2.7940831184387207, "logps/chosen": -113.9233169555664, "logps/ref_chosen": -50.32819366455078, "logps/ref_rejected": -58.97947692871094, "logps/rejected": -191.756103515625, "loss": 1.0106, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5234329700469971, "rewards/margins": 0.5641729831695557, "rewards/rejected": -1.0876059532165527, "step": 479 }, { "epoch": 0.7256235827664399, "epsilon_dpo/beta": 0.008165966719388962, "epsilon_dpo/beta_margin_grad_mean": -0.3647410571575165, "epsilon_dpo/beta_margin_grad_std": 0.17620109021663666, "epsilon_dpo/beta_margin_mean": 0.6069278717041016, "epsilon_dpo/beta_margin_std": 0.8099779486656189, "epsilon_dpo/loss_margin_mean": 74.99452209472656, "grad_norm": 43.37430191040039, "kl/avg_steps": 0.5625, "kl/beta": 0.008211338892579079, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0714794091391072e-07, "logits/chosen": -3.0721914768218994, "logits/rejected": -2.917562961578369, "logps/chosen": -124.82667541503906, "logps/ref_chosen": -55.76716613769531, "logps/ref_rejected": -64.60369110107422, "logps/rejected": -208.65771484375, "loss": 1.0186, "rewards/accuracies": 0.796875, "rewards/chosen": -0.56634122133255, "rewards/margins": 0.6069278717041016, "rewards/rejected": -1.1732690334320068, "step": 480 }, { "epoch": 0.72713529856387, "epsilon_dpo/beta": 0.008117739111185074, "epsilon_dpo/beta_margin_grad_mean": -0.378347784280777, "epsilon_dpo/beta_margin_grad_std": 0.15316613018512726, "epsilon_dpo/beta_margin_mean": 0.5437122583389282, "epsilon_dpo/beta_margin_std": 0.7222504615783691, "epsilon_dpo/loss_margin_mean": 67.42758178710938, "grad_norm": 37.390743255615234, "kl/avg_steps": 0.59375, "kl/beta": 0.008165408857166767, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.0606484367268906e-07, "logits/chosen": -3.090466260910034, "logits/rejected": -2.960984945297241, "logps/chosen": -134.95208740234375, "logps/ref_chosen": -62.20503616333008, "logps/ref_rejected": -62.93307876586914, "logps/rejected": -203.10769653320312, "loss": 1.0319, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5920074582099915, "rewards/margins": 0.5437122583389282, "rewards/rejected": -1.135719656944275, "step": 481 }, { "epoch": 0.7286470143613001, "epsilon_dpo/beta": 0.008069823496043682, "epsilon_dpo/beta_margin_grad_mean": -0.38707828521728516, "epsilon_dpo/beta_margin_grad_std": 0.16094250977039337, "epsilon_dpo/beta_margin_mean": 0.5296624302864075, "epsilon_dpo/beta_margin_std": 0.7988515496253967, "epsilon_dpo/loss_margin_mean": 66.14301300048828, "grad_norm": 37.55079650878906, "kl/avg_steps": 0.59375, "kl/beta": 0.008117212913930416, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.0498577260720048e-07, "logits/chosen": -2.999875068664551, "logits/rejected": -3.1038661003112793, "logps/chosen": -136.13937377929688, "logps/ref_chosen": -55.681915283203125, "logps/ref_rejected": -91.20880126953125, "logps/rejected": -237.80929565429688, "loss": 1.0617, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6508998870849609, "rewards/margins": 0.5296624898910522, "rewards/rejected": -1.1805623769760132, "step": 482 }, { "epoch": 0.7301587301587301, "epsilon_dpo/beta": 0.00801714789122343, "epsilon_dpo/beta_margin_grad_mean": -0.3528984487056732, "epsilon_dpo/beta_margin_grad_std": 0.14112207293510437, "epsilon_dpo/beta_margin_mean": 0.6678046584129333, "epsilon_dpo/beta_margin_std": 0.6695928573608398, "epsilon_dpo/loss_margin_mean": 83.7013931274414, "grad_norm": 41.722015380859375, "kl/avg_steps": 0.65625, "kl/beta": 0.008069301024079323, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.0391075790138232e-07, "logits/chosen": -2.9585354328155518, "logits/rejected": -3.049147605895996, "logps/chosen": -105.969970703125, "logps/ref_chosen": -47.599761962890625, "logps/ref_rejected": -72.28488159179688, "logps/rejected": -214.35647583007812, "loss": 0.9252, "rewards/accuracies": 0.859375, "rewards/chosen": -0.47000598907470703, "rewards/margins": 0.6678046584129333, "rewards/rejected": -1.1378107070922852, "step": 483 }, { "epoch": 0.7316704459561603, "epsilon_dpo/beta": 0.007969889789819717, "epsilon_dpo/beta_margin_grad_mean": -0.38915571570396423, "epsilon_dpo/beta_margin_grad_std": 0.13463613390922546, "epsilon_dpo/beta_margin_mean": 0.48408761620521545, "epsilon_dpo/beta_margin_std": 0.5998511910438538, "epsilon_dpo/loss_margin_mean": 61.1665153503418, "grad_norm": 44.82302474975586, "kl/avg_steps": 0.59375, "kl/beta": 0.008016691543161869, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.0283982962570681e-07, "logits/chosen": -3.0187864303588867, "logits/rejected": -2.9453258514404297, "logps/chosen": -117.80728912353516, "logps/ref_chosen": -49.678001403808594, "logps/ref_rejected": -61.5303955078125, "logps/rejected": -190.82620239257812, "loss": 1.0432, "rewards/accuracies": 0.8125, "rewards/chosen": -0.544452428817749, "rewards/margins": 0.48408758640289307, "rewards/rejected": -1.0285401344299316, "step": 484 }, { "epoch": 0.7331821617535903, "epsilon_dpo/beta": 0.007924080826342106, "epsilon_dpo/beta_margin_grad_mean": -0.3795631229877472, "epsilon_dpo/beta_margin_grad_std": 0.1330898553133011, "epsilon_dpo/beta_margin_mean": 0.535297155380249, "epsilon_dpo/beta_margin_std": 0.6089780330657959, "epsilon_dpo/loss_margin_mean": 67.95079803466797, "grad_norm": 35.98203659057617, "kl/avg_steps": 0.578125, "kl/beta": 0.00796937383711338, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0177301773633992e-07, "logits/chosen": -3.0292232036590576, "logits/rejected": -2.8409886360168457, "logps/chosen": -127.94165802001953, "logps/ref_chosen": -55.31871795654297, "logps/ref_rejected": -66.27642059326172, "logps/rejected": -206.85015869140625, "loss": 1.0051, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5764427781105042, "rewards/margins": 0.535297155380249, "rewards/rejected": -1.1117398738861084, "step": 485 }, { "epoch": 0.7346938775510204, "epsilon_dpo/beta": 0.007889688946306705, "epsilon_dpo/beta_margin_grad_mean": -0.38176867365837097, "epsilon_dpo/beta_margin_grad_std": 0.15477198362350464, "epsilon_dpo/beta_margin_mean": 0.5442383885383606, "epsilon_dpo/beta_margin_std": 0.7165350914001465, "epsilon_dpo/loss_margin_mean": 69.56564331054688, "grad_norm": 40.761451721191406, "kl/avg_steps": 0.4375, "kl/beta": 0.007923565804958344, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.007103520743035e-07, "logits/chosen": -2.8089499473571777, "logits/rejected": -2.914480209350586, "logps/chosen": -128.2469024658203, "logps/ref_chosen": -47.17963409423828, "logps/ref_rejected": -75.31692504882812, "logps/rejected": -225.94984436035156, "loss": 1.0297, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6414223909378052, "rewards/margins": 0.5442384481430054, "rewards/rejected": -1.1856608390808105, "step": 486 }, { "epoch": 0.7362055933484505, "epsilon_dpo/beta": 0.00785039085894823, "epsilon_dpo/beta_margin_grad_mean": -0.39141690731048584, "epsilon_dpo/beta_margin_grad_std": 0.15399259328842163, "epsilon_dpo/beta_margin_mean": 0.50898677110672, "epsilon_dpo/beta_margin_std": 0.7915201783180237, "epsilon_dpo/loss_margin_mean": 65.38215637207031, "grad_norm": 52.37213134765625, "kl/avg_steps": 0.5, "kl/beta": 0.007889050990343094, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.965186236464046e-08, "logits/chosen": -2.91711688041687, "logits/rejected": -3.044752836227417, "logps/chosen": -127.31414794921875, "logps/ref_chosen": -52.8890266418457, "logps/ref_rejected": -78.99122619628906, "logps/rejected": -218.79852294921875, "loss": 1.0696, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5862300395965576, "rewards/margins": 0.50898677110672, "rewards/rejected": -1.0952167510986328, "step": 487 }, { "epoch": 0.7377173091458806, "epsilon_dpo/beta": 0.007806427776813507, "epsilon_dpo/beta_margin_grad_mean": -0.3717724680900574, "epsilon_dpo/beta_margin_grad_std": 0.1533455103635788, "epsilon_dpo/beta_margin_mean": 0.5857774019241333, "epsilon_dpo/beta_margin_std": 0.7060866355895996, "epsilon_dpo/loss_margin_mean": 75.54196166992188, "grad_norm": 38.499820709228516, "kl/avg_steps": 0.5625, "kl/beta": 0.007849802263081074, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.859757821558337e-08, "logits/chosen": -3.0275957584381104, "logits/rejected": -2.833566665649414, "logps/chosen": -126.12478637695312, "logps/ref_chosen": -56.79566955566406, "logps/ref_rejected": -69.90049743652344, "logps/rejected": -214.77157592773438, "loss": 0.9963, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5426954030990601, "rewards/margins": 0.5857774019241333, "rewards/rejected": -1.1284728050231934, "step": 488 }, { "epoch": 0.7392290249433107, "epsilon_dpo/beta": 0.007767640985548496, "epsilon_dpo/beta_margin_grad_mean": -0.396861732006073, "epsilon_dpo/beta_margin_grad_std": 0.15693841874599457, "epsilon_dpo/beta_margin_mean": 0.4590238332748413, "epsilon_dpo/beta_margin_std": 0.7086181044578552, "epsilon_dpo/loss_margin_mean": 59.6797981262207, "grad_norm": 39.52256774902344, "kl/avg_steps": 0.5, "kl/beta": 0.007805893663316965, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.754752911772615e-08, "logits/chosen": -2.98934268951416, "logits/rejected": -3.082855463027954, "logps/chosen": -133.67401123046875, "logps/ref_chosen": -54.353004455566406, "logps/ref_rejected": -76.5740966796875, "logps/rejected": -215.57492065429688, "loss": 1.0949, "rewards/accuracies": 0.75, "rewards/chosen": -0.617673933506012, "rewards/margins": 0.4590238630771637, "rewards/rejected": -1.076697826385498, "step": 489 }, { "epoch": 0.7407407407407407, "epsilon_dpo/beta": 0.007736278232187033, "epsilon_dpo/beta_margin_grad_mean": -0.3966467082500458, "epsilon_dpo/beta_margin_grad_std": 0.18160822987556458, "epsilon_dpo/beta_margin_mean": 0.48425695300102234, "epsilon_dpo/beta_margin_std": 0.859203577041626, "epsilon_dpo/loss_margin_mean": 63.38637924194336, "grad_norm": 49.755706787109375, "kl/avg_steps": 0.40625, "kl/beta": 0.007767058443278074, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.650174444319956e-08, "logits/chosen": -2.97818922996521, "logits/rejected": -2.8531765937805176, "logps/chosen": -133.44039916992188, "logps/ref_chosen": -56.27444839477539, "logps/ref_rejected": -65.3909912109375, "logps/rejected": -205.94332885742188, "loss": 1.1238, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5996379256248474, "rewards/margins": 0.48425692319869995, "rewards/rejected": -1.083894968032837, "step": 490 }, { "epoch": 0.7422524565381708, "epsilon_dpo/beta": 0.007707394193857908, "epsilon_dpo/beta_margin_grad_mean": -0.39913296699523926, "epsilon_dpo/beta_margin_grad_std": 0.14908455312252045, "epsilon_dpo/beta_margin_mean": 0.45876824855804443, "epsilon_dpo/beta_margin_std": 0.6842023134231567, "epsilon_dpo/loss_margin_mean": 60.08913040161133, "grad_norm": 44.07225036621094, "kl/avg_steps": 0.375, "kl/beta": 0.007735632359981537, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.546025344484868e-08, "logits/chosen": -2.9040579795837402, "logits/rejected": -2.8765950202941895, "logps/chosen": -124.69682312011719, "logps/ref_chosen": -48.973819732666016, "logps/ref_rejected": -70.2862777709961, "logps/rejected": -206.09841918945312, "loss": 1.0855, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5841909646987915, "rewards/margins": 0.45876824855804443, "rewards/rejected": -1.042959213256836, "step": 491 }, { "epoch": 0.7437641723356009, "epsilon_dpo/beta": 0.0076761916279792786, "epsilon_dpo/beta_margin_grad_mean": -0.3959881067276001, "epsilon_dpo/beta_margin_grad_std": 0.16612862050533295, "epsilon_dpo/beta_margin_mean": 0.4829450249671936, "epsilon_dpo/beta_margin_std": 0.779258131980896, "epsilon_dpo/loss_margin_mean": 63.58208465576172, "grad_norm": 38.84324264526367, "kl/avg_steps": 0.40625, "kl/beta": 0.007706732489168644, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.442308525541589e-08, "logits/chosen": -2.9204399585723877, "logits/rejected": -2.957973003387451, "logps/chosen": -146.99362182617188, "logps/ref_chosen": -59.815608978271484, "logps/ref_rejected": -72.96083068847656, "logps/rejected": -223.720947265625, "loss": 1.0965, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6716978549957275, "rewards/margins": 0.4829450249671936, "rewards/rejected": -1.1546428203582764, "step": 492 }, { "epoch": 0.745275888133031, "epsilon_dpo/beta": 0.007634326349943876, "epsilon_dpo/beta_margin_grad_mean": -0.36744338274002075, "epsilon_dpo/beta_margin_grad_std": 0.15218232572078705, "epsilon_dpo/beta_margin_mean": 0.6017930507659912, "epsilon_dpo/beta_margin_std": 0.7092916369438171, "epsilon_dpo/loss_margin_mean": 79.3526840209961, "grad_norm": 34.33770751953125, "kl/avg_steps": 0.546875, "kl/beta": 0.007675550412386656, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.339026888672468e-08, "logits/chosen": -2.996459484100342, "logits/rejected": -2.9844508171081543, "logps/chosen": -107.84745025634766, "logps/ref_chosen": -49.818687438964844, "logps/ref_rejected": -71.48409271240234, "logps/rejected": -208.86553955078125, "loss": 0.9852, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4440092444419861, "rewards/margins": 0.6017930507659912, "rewards/rejected": -1.0458022356033325, "step": 493 }, { "epoch": 0.7467876039304611, "epsilon_dpo/beta": 0.0075963931158185005, "epsilon_dpo/beta_margin_grad_mean": -0.39531928300857544, "epsilon_dpo/beta_margin_grad_std": 0.16287456452846527, "epsilon_dpo/beta_margin_mean": 0.48147690296173096, "epsilon_dpo/beta_margin_std": 0.7800478935241699, "epsilon_dpo/loss_margin_mean": 63.98380661010742, "grad_norm": 40.0687141418457, "kl/avg_steps": 0.5, "kl/beta": 0.007633802946656942, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.236183322886945e-08, "logits/chosen": -3.036259174346924, "logits/rejected": -2.9855732917785645, "logps/chosen": -133.03399658203125, "logps/ref_chosen": -65.77702331542969, "logps/ref_rejected": -74.79572296142578, "logps/rejected": -206.0364990234375, "loss": 1.0966, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5127191543579102, "rewards/margins": 0.48147690296173096, "rewards/rejected": -0.9941960573196411, "step": 494 }, { "epoch": 0.7482993197278912, "epsilon_dpo/beta": 0.007553853094577789, "epsilon_dpo/beta_margin_grad_mean": -0.38129398226737976, "epsilon_dpo/beta_margin_grad_std": 0.13209475576877594, "epsilon_dpo/beta_margin_mean": 0.539579451084137, "epsilon_dpo/beta_margin_std": 0.6482419371604919, "epsilon_dpo/loss_margin_mean": 71.85853576660156, "grad_norm": 38.60284423828125, "kl/avg_steps": 0.5625, "kl/beta": 0.0075958240777254105, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.133780704940594e-08, "logits/chosen": -2.7773361206054688, "logits/rejected": -2.9270377159118652, "logps/chosen": -105.00171661376953, "logps/ref_chosen": -43.784461975097656, "logps/ref_rejected": -65.81676483154297, "logps/rejected": -198.89254760742188, "loss": 1.0075, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46360111236572266, "rewards/margins": 0.5395795106887817, "rewards/rejected": -1.0031805038452148, "step": 495 }, { "epoch": 0.7498110355253212, "epsilon_dpo/beta": 0.007525763474404812, "epsilon_dpo/beta_margin_grad_mean": -0.3853885233402252, "epsilon_dpo/beta_margin_grad_std": 0.17010442912578583, "epsilon_dpo/beta_margin_mean": 0.5347515940666199, "epsilon_dpo/beta_margin_std": 0.7886155843734741, "epsilon_dpo/loss_margin_mean": 71.81108856201172, "grad_norm": 48.541229248046875, "kl/avg_steps": 0.375, "kl/beta": 0.007553336210548878, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.031821899254797e-08, "logits/chosen": -2.8356199264526367, "logits/rejected": -2.8272812366485596, "logps/chosen": -120.49705505371094, "logps/ref_chosen": -46.80432891845703, "logps/ref_rejected": -75.68951416015625, "logps/rejected": -221.19332885742188, "loss": 1.0611, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5577501058578491, "rewards/margins": 0.5347515940666199, "rewards/rejected": -1.0925016403198242, "step": 496 }, { "epoch": 0.7513227513227513, "epsilon_dpo/beta": 0.007476481143385172, "epsilon_dpo/beta_margin_grad_mean": -0.3400881290435791, "epsilon_dpo/beta_margin_grad_std": 0.13886487483978271, "epsilon_dpo/beta_margin_mean": 0.7273428440093994, "epsilon_dpo/beta_margin_std": 0.6509276032447815, "epsilon_dpo/loss_margin_mean": 97.69979858398438, "grad_norm": 27.575048446655273, "kl/avg_steps": 0.65625, "kl/beta": 0.007525117136538029, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.930309757836516e-08, "logits/chosen": -3.0637900829315186, "logits/rejected": -2.9773988723754883, "logps/chosen": -133.81475830078125, "logps/ref_chosen": -63.532344818115234, "logps/ref_rejected": -80.39402770996094, "logps/rejected": -248.37625122070312, "loss": 0.88, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5270121097564697, "rewards/margins": 0.7273428440093994, "rewards/rejected": -1.2543549537658691, "step": 497 }, { "epoch": 0.7528344671201814, "epsilon_dpo/beta": 0.007439419161528349, "epsilon_dpo/beta_margin_grad_mean": -0.37045037746429443, "epsilon_dpo/beta_margin_grad_std": 0.14938174188137054, "epsilon_dpo/beta_margin_mean": 0.5895306468009949, "epsilon_dpo/beta_margin_std": 0.6907029747962952, "epsilon_dpo/loss_margin_mean": 79.80892181396484, "grad_norm": 42.92643737792969, "kl/avg_steps": 0.5, "kl/beta": 0.007476055528968573, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.829247120198563e-08, "logits/chosen": -3.060647964477539, "logits/rejected": -2.8303136825561523, "logps/chosen": -119.10681915283203, "logps/ref_chosen": -54.59954071044922, "logps/ref_rejected": -59.73279571533203, "logps/rejected": -204.04901123046875, "loss": 0.9884, "rewards/accuracies": 0.78125, "rewards/chosen": -0.48280271887779236, "rewards/margins": 0.5895305871963501, "rewards/rejected": -1.0723333358764648, "step": 498 }, { "epoch": 0.7543461829176115, "epsilon_dpo/beta": 0.007409381680190563, "epsilon_dpo/beta_margin_grad_mean": -0.3688298463821411, "epsilon_dpo/beta_margin_grad_std": 0.16148851811885834, "epsilon_dpo/beta_margin_mean": 0.6291952729225159, "epsilon_dpo/beta_margin_std": 0.8071854710578918, "epsilon_dpo/loss_margin_mean": 85.62075805664062, "grad_norm": 46.36285400390625, "kl/avg_steps": 0.40625, "kl/beta": 0.007438861299306154, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 8.728636813280163e-08, "logits/chosen": -3.01594614982605, "logits/rejected": -3.1116886138916016, "logps/chosen": -113.1514892578125, "logps/ref_chosen": -51.715484619140625, "logps/ref_rejected": -82.72132873535156, "logps/rejected": -229.77809143066406, "loss": 0.9897, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4573938250541687, "rewards/margins": 0.6291953325271606, "rewards/rejected": -1.0865890979766846, "step": 499 }, { "epoch": 0.7558578987150416, "epsilon_dpo/beta": 0.007370140869170427, "epsilon_dpo/beta_margin_grad_mean": -0.3821001350879669, "epsilon_dpo/beta_margin_grad_std": 0.15127451717853546, "epsilon_dpo/beta_margin_mean": 0.5324572324752808, "epsilon_dpo/beta_margin_std": 0.6965858936309814, "epsilon_dpo/loss_margin_mean": 72.78131866455078, "grad_norm": 51.224552154541016, "kl/avg_steps": 0.53125, "kl/beta": 0.0074087632820010185, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.628481651367875e-08, "logits/chosen": -3.1126279830932617, "logits/rejected": -2.854917287826538, "logps/chosen": -133.07460021972656, "logps/ref_chosen": -65.06736755371094, "logps/ref_rejected": -64.74053955078125, "logps/rejected": -205.52908325195312, "loss": 1.0329, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5042167901992798, "rewards/margins": 0.5324572324752808, "rewards/rejected": -1.0366740226745605, "step": 500 }, { "epoch": 0.7558578987150416, "eval_epsilon_dpo/beta": 0.00733790872618556, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4038424491882324, "eval_epsilon_dpo/beta_margin_grad_std": 0.15525583922863007, "eval_epsilon_dpo/beta_margin_mean": 0.43164491653442383, "eval_epsilon_dpo/beta_margin_std": 0.7150496244430542, "eval_epsilon_dpo/loss_margin_mean": 59.460208892822266, "eval_kl/n_epsilon_steps": 0.2790493071079254, "eval_kl/p_epsilon_steps": 0.7191901206970215, "eval_logits/chosen": -3.2563464641571045, "eval_logits/rejected": -3.08235239982605, "eval_logps/chosen": -156.10247802734375, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -211.68215942382812, "eval_loss": 0.5607970356941223, "eval_rewards/accuracies": 0.7257922291755676, "eval_rewards/chosen": -0.5799612998962402, "eval_rewards/margins": 0.43164491653442383, "eval_rewards/rejected": -1.011606216430664, "eval_runtime": 37.0337, "eval_samples_per_second": 62.187, "eval_steps_per_second": 1.944, "step": 500 }, { "epoch": 0.7573696145124716, "epsilon_dpo/beta": 0.007324284873902798, "epsilon_dpo/beta_margin_grad_mean": -0.3951195776462555, "epsilon_dpo/beta_margin_grad_std": 0.12190718948841095, "epsilon_dpo/beta_margin_mean": 0.4633955955505371, "epsilon_dpo/beta_margin_std": 0.5559080243110657, "epsilon_dpo/loss_margin_mean": 63.60774230957031, "grad_norm": 34.26152801513672, "kl/avg_steps": 0.625, "kl/beta": 0.007369612343609333, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.528784436016878e-08, "logits/chosen": -3.085878849029541, "logits/rejected": -2.9246068000793457, "logps/chosen": -123.36074829101562, "logps/ref_chosen": -55.657737731933594, "logps/ref_rejected": -67.8047103881836, "logps/rejected": -199.11546325683594, "loss": 1.046, "rewards/accuracies": 0.796875, "rewards/chosen": -0.49674493074417114, "rewards/margins": 0.4633955955505371, "rewards/rejected": -0.960140585899353, "step": 501 }, { "epoch": 0.7588813303099018, "epsilon_dpo/beta": 0.007285658735781908, "epsilon_dpo/beta_margin_grad_mean": -0.38645535707473755, "epsilon_dpo/beta_margin_grad_std": 0.13621246814727783, "epsilon_dpo/beta_margin_mean": 0.5040711760520935, "epsilon_dpo/beta_margin_std": 0.6184014678001404, "epsilon_dpo/loss_margin_mean": 69.67266845703125, "grad_norm": 38.84273147583008, "kl/avg_steps": 0.53125, "kl/beta": 0.00732383830472827, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.4295479559726e-08, "logits/chosen": -3.004547119140625, "logits/rejected": -3.062962532043457, "logps/chosen": -125.55531311035156, "logps/ref_chosen": -57.86445236206055, "logps/ref_rejected": -80.10221862792969, "logps/rejected": -217.4657440185547, "loss": 1.0318, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4950679540634155, "rewards/margins": 0.5040711760520935, "rewards/rejected": -0.9991391897201538, "step": 502 }, { "epoch": 0.7603930461073318, "epsilon_dpo/beta": 0.007244881242513657, "epsilon_dpo/beta_margin_grad_mean": -0.3971402049064636, "epsilon_dpo/beta_margin_grad_std": 0.15177951753139496, "epsilon_dpo/beta_margin_mean": 0.45825666189193726, "epsilon_dpo/beta_margin_std": 0.6895651817321777, "epsilon_dpo/loss_margin_mean": 63.78998565673828, "grad_norm": 32.562339782714844, "kl/avg_steps": 0.5625, "kl/beta": 0.0072851357981562614, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.330774987092712e-08, "logits/chosen": -2.8538990020751953, "logits/rejected": -2.6417040824890137, "logps/chosen": -106.99624633789062, "logps/ref_chosen": -45.11316680908203, "logps/ref_rejected": -49.7468376159668, "logps/rejected": -175.41990661621094, "loss": 1.0889, "rewards/accuracies": 0.765625, "rewards/chosen": -0.45119500160217285, "rewards/margins": 0.45825666189193726, "rewards/rejected": -0.9094517230987549, "step": 503 }, { "epoch": 0.7619047619047619, "epsilon_dpo/beta": 0.007207741495221853, "epsilon_dpo/beta_margin_grad_mean": -0.38236457109451294, "epsilon_dpo/beta_margin_grad_std": 0.14294874668121338, "epsilon_dpo/beta_margin_mean": 0.5228719711303711, "epsilon_dpo/beta_margin_std": 0.6393216848373413, "epsilon_dpo/loss_margin_mean": 73.02294158935547, "grad_norm": 37.86565399169922, "kl/avg_steps": 0.515625, "kl/beta": 0.0072443862445652485, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.232468292269479e-08, "logits/chosen": -3.0809526443481445, "logits/rejected": -2.8696625232696533, "logps/chosen": -132.6295928955078, "logps/ref_chosen": -61.24930953979492, "logps/ref_rejected": -69.04974365234375, "logps/rejected": -213.45297241210938, "loss": 1.0246, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5169135332107544, "rewards/margins": 0.5228719711303711, "rewards/rejected": -1.039785623550415, "step": 504 }, { "epoch": 0.763416477702192, "epsilon_dpo/beta": 0.007176410406827927, "epsilon_dpo/beta_margin_grad_mean": -0.4104072153568268, "epsilon_dpo/beta_margin_grad_std": 0.16137582063674927, "epsilon_dpo/beta_margin_mean": 0.4136159121990204, "epsilon_dpo/beta_margin_std": 0.7398061752319336, "epsilon_dpo/loss_margin_mean": 58.29680633544922, "grad_norm": 49.246036529541016, "kl/avg_steps": 0.4375, "kl/beta": 0.007207224145531654, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 8.134630621352483e-08, "logits/chosen": -2.9876012802124023, "logits/rejected": -3.0184221267700195, "logps/chosen": -125.69742584228516, "logps/ref_chosen": -55.810546875, "logps/ref_rejected": -73.23880004882812, "logps/rejected": -201.4224853515625, "loss": 1.1395, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5034447908401489, "rewards/margins": 0.4136159121990204, "rewards/rejected": -0.9170607328414917, "step": 505 }, { "epoch": 0.764928193499622, "epsilon_dpo/beta": 0.007140664383769035, "epsilon_dpo/beta_margin_grad_mean": -0.39997339248657227, "epsilon_dpo/beta_margin_grad_std": 0.15013551712036133, "epsilon_dpo/beta_margin_mean": 0.4456862211227417, "epsilon_dpo/beta_margin_std": 0.6688446998596191, "epsilon_dpo/loss_margin_mean": 63.018272399902344, "grad_norm": 41.16862106323242, "kl/avg_steps": 0.5, "kl/beta": 0.007175829727202654, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.037264711071698e-08, "logits/chosen": -3.181790590286255, "logits/rejected": -3.0489487648010254, "logps/chosen": -134.08726501464844, "logps/ref_chosen": -65.04371643066406, "logps/ref_rejected": -74.94888305664062, "logps/rejected": -207.01071166992188, "loss": 1.0933, "rewards/accuracies": 0.734375, "rewards/chosen": -0.49515384435653687, "rewards/margins": 0.4456862211227417, "rewards/rejected": -0.9408400058746338, "step": 506 }, { "epoch": 0.7664399092970522, "epsilon_dpo/beta": 0.0070962123572826385, "epsilon_dpo/beta_margin_grad_mean": -0.38349825143814087, "epsilon_dpo/beta_margin_grad_std": 0.16468922793865204, "epsilon_dpo/beta_margin_mean": 0.5380831360816956, "epsilon_dpo/beta_margin_std": 0.8195880055427551, "epsilon_dpo/loss_margin_mean": 76.4280776977539, "grad_norm": 32.7825813293457, "kl/avg_steps": 0.625, "kl/beta": 0.0071401288732886314, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.940373284960933e-08, "logits/chosen": -3.096954345703125, "logits/rejected": -3.0884528160095215, "logps/chosen": -135.63888549804688, "logps/ref_chosen": -62.45527267456055, "logps/ref_rejected": -86.65788269042969, "logps/rejected": -236.26956176757812, "loss": 1.0626, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5210789442062378, "rewards/margins": 0.5380831360816956, "rewards/rejected": -1.0591621398925781, "step": 507 }, { "epoch": 0.7679516250944822, "epsilon_dpo/beta": 0.007061007432639599, "epsilon_dpo/beta_margin_grad_mean": -0.376692533493042, "epsilon_dpo/beta_margin_grad_std": 0.1538206785917282, "epsilon_dpo/beta_margin_mean": 0.554760754108429, "epsilon_dpo/beta_margin_std": 0.7193010449409485, "epsilon_dpo/loss_margin_mean": 79.2007064819336, "grad_norm": 33.159156799316406, "kl/avg_steps": 0.5, "kl/beta": 0.007095780223608017, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.843959053281663e-08, "logits/chosen": -2.9666225910186768, "logits/rejected": -3.1040258407592773, "logps/chosen": -122.66288757324219, "logps/ref_chosen": -53.621055603027344, "logps/ref_rejected": -88.15339660644531, "logps/rejected": -236.39593505859375, "loss": 1.0226, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4884149432182312, "rewards/margins": 0.554760754108429, "rewards/rejected": -1.0431756973266602, "step": 508 }, { "epoch": 0.7694633408919124, "epsilon_dpo/beta": 0.007023671641945839, "epsilon_dpo/beta_margin_grad_mean": -0.3967084288597107, "epsilon_dpo/beta_margin_grad_std": 0.1436932533979416, "epsilon_dpo/beta_margin_mean": 0.45068642497062683, "epsilon_dpo/beta_margin_std": 0.6293744444847107, "epsilon_dpo/loss_margin_mean": 64.72048950195312, "grad_norm": 44.39530944824219, "kl/avg_steps": 0.53125, "kl/beta": 0.00706047797575593, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.748024712947204e-08, "logits/chosen": -2.8275539875030518, "logits/rejected": -2.8637447357177734, "logps/chosen": -122.241943359375, "logps/ref_chosen": -52.83343505859375, "logps/ref_rejected": -65.79664611816406, "logps/rejected": -199.92562866210938, "loss": 1.0787, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4892672300338745, "rewards/margins": 0.45068639516830444, "rewards/rejected": -0.9399536848068237, "step": 509 }, { "epoch": 0.7709750566893424, "epsilon_dpo/beta": 0.006979970261454582, "epsilon_dpo/beta_margin_grad_mean": -0.3723670244216919, "epsilon_dpo/beta_margin_grad_std": 0.1713150590658188, "epsilon_dpo/beta_margin_mean": 0.6000616550445557, "epsilon_dpo/beta_margin_std": 0.8300207853317261, "epsilon_dpo/loss_margin_mean": 86.62559509277344, "grad_norm": 41.50522232055664, "kl/avg_steps": 0.625, "kl/beta": 0.007023167330771685, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.652572947447272e-08, "logits/chosen": -2.9791646003723145, "logits/rejected": -3.0781335830688477, "logps/chosen": -115.27822875976562, "logps/ref_chosen": -49.32378387451172, "logps/ref_rejected": -85.82286071777344, "logps/rejected": -238.40289306640625, "loss": 1.0233, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4626058042049408, "rewards/margins": 0.6000616550445557, "rewards/rejected": -1.0626674890518188, "step": 510 }, { "epoch": 0.7724867724867724, "epsilon_dpo/beta": 0.006943160202354193, "epsilon_dpo/beta_margin_grad_mean": -0.35687482357025146, "epsilon_dpo/beta_margin_grad_std": 0.15585577487945557, "epsilon_dpo/beta_margin_mean": 0.6541997194290161, "epsilon_dpo/beta_margin_std": 0.7319356799125671, "epsilon_dpo/loss_margin_mean": 94.85025024414062, "grad_norm": 30.745098114013672, "kl/avg_steps": 0.53125, "kl/beta": 0.00697954511269927, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.557606426772961e-08, "logits/chosen": -2.9528911113739014, "logits/rejected": -2.974071502685547, "logps/chosen": -124.519287109375, "logps/ref_chosen": -53.72102737426758, "logps/ref_rejected": -83.03715515136719, "logps/rejected": -248.6856689453125, "loss": 0.9548, "rewards/accuracies": 0.796875, "rewards/chosen": -0.49385714530944824, "rewards/margins": 0.6541997194290161, "rewards/rejected": -1.148056983947754, "step": 511 }, { "epoch": 0.7739984882842026, "epsilon_dpo/beta": 0.006911884061992168, "epsilon_dpo/beta_margin_grad_mean": -0.39258766174316406, "epsilon_dpo/beta_margin_grad_std": 0.13970740139484406, "epsilon_dpo/beta_margin_mean": 0.47434836626052856, "epsilon_dpo/beta_margin_std": 0.6180079579353333, "epsilon_dpo/loss_margin_mean": 69.21267700195312, "grad_norm": 38.668277740478516, "kl/avg_steps": 0.453125, "kl/beta": 0.006942662410438061, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.463127807341966e-08, "logits/chosen": -3.0006868839263916, "logits/rejected": -2.8296306133270264, "logps/chosen": -117.05911254882812, "logps/ref_chosen": -54.16181182861328, "logps/ref_rejected": -60.938201904296875, "logps/rejected": -193.04818725585938, "loss": 1.056, "rewards/accuracies": 0.75, "rewards/chosen": -0.4369485378265381, "rewards/margins": 0.47434839606285095, "rewards/rejected": -0.9112969636917114, "step": 512 }, { "epoch": 0.7755102040816326, "epsilon_dpo/beta": 0.006868836469948292, "epsilon_dpo/beta_margin_grad_mean": -0.3745346963405609, "epsilon_dpo/beta_margin_grad_std": 0.13604199886322021, "epsilon_dpo/beta_margin_mean": 0.5604533553123474, "epsilon_dpo/beta_margin_std": 0.6281871199607849, "epsilon_dpo/loss_margin_mean": 82.01950073242188, "grad_norm": 34.606842041015625, "kl/avg_steps": 0.625, "kl/beta": 0.006911345291882753, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.369139731924401e-08, "logits/chosen": -2.867465019226074, "logits/rejected": -2.8276450634002686, "logps/chosen": -101.76954650878906, "logps/ref_chosen": -44.1099853515625, "logps/ref_rejected": -59.48210906982422, "logps/rejected": -199.1611785888672, "loss": 0.9913, "rewards/accuracies": 0.828125, "rewards/chosen": -0.39775681495666504, "rewards/margins": 0.5604534149169922, "rewards/rejected": -0.9582101702690125, "step": 513 }, { "epoch": 0.7770219198790628, "epsilon_dpo/beta": 0.006819732952862978, "epsilon_dpo/beta_margin_grad_mean": -0.3581527769565582, "epsilon_dpo/beta_margin_grad_std": 0.1334848552942276, "epsilon_dpo/beta_margin_mean": 0.6384442448616028, "epsilon_dpo/beta_margin_std": 0.6430338621139526, "epsilon_dpo/loss_margin_mean": 94.02555847167969, "grad_norm": 39.366207122802734, "kl/avg_steps": 0.71875, "kl/beta": 0.006868417840451002, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.275644829568747e-08, "logits/chosen": -3.105492353439331, "logits/rejected": -2.9334261417388916, "logps/chosen": -124.1356201171875, "logps/ref_chosen": -59.256065368652344, "logps/ref_rejected": -76.36601257324219, "logps/rejected": -235.2711181640625, "loss": 0.936, "rewards/accuracies": 0.859375, "rewards/chosen": -0.44450265169143677, "rewards/margins": 0.638444185256958, "rewards/rejected": -1.0829468965530396, "step": 514 }, { "epoch": 0.7785336356764928, "epsilon_dpo/beta": 0.006775328423827887, "epsilon_dpo/beta_margin_grad_mean": -0.3701605498790741, "epsilon_dpo/beta_margin_grad_std": 0.12910836935043335, "epsilon_dpo/beta_margin_mean": 0.5764958262443542, "epsilon_dpo/beta_margin_std": 0.5842675566673279, "epsilon_dpo/loss_margin_mean": 85.4928207397461, "grad_norm": 33.335044860839844, "kl/avg_steps": 0.65625, "kl/beta": 0.006819403264671564, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.182645715528435e-08, "logits/chosen": -2.830134868621826, "logits/rejected": -2.8089051246643066, "logps/chosen": -115.62060546875, "logps/ref_chosen": -49.75600051879883, "logps/ref_rejected": -72.72505950927734, "logps/rejected": -224.08248901367188, "loss": 0.9686, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4476158916950226, "rewards/margins": 0.5764958262443542, "rewards/rejected": -1.0241117477416992, "step": 515 }, { "epoch": 0.780045351473923, "epsilon_dpo/beta": 0.006733272690325975, "epsilon_dpo/beta_margin_grad_mean": -0.39368221163749695, "epsilon_dpo/beta_margin_grad_std": 0.14413417875766754, "epsilon_dpo/beta_margin_mean": 0.46791815757751465, "epsilon_dpo/beta_margin_std": 0.6604968309402466, "epsilon_dpo/loss_margin_mean": 70.00223541259766, "grad_norm": 36.1458854675293, "kl/avg_steps": 0.625, "kl/beta": 0.006774942856281996, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.090144991188568e-08, "logits/chosen": -2.9159159660339355, "logits/rejected": -2.7660303115844727, "logps/chosen": -114.50862121582031, "logps/ref_chosen": -50.218711853027344, "logps/ref_rejected": -62.196388244628906, "logps/rejected": -196.488525390625, "loss": 1.0719, "rewards/accuracies": 0.828125, "rewards/chosen": -0.43447840213775635, "rewards/margins": 0.46791815757751465, "rewards/rejected": -0.902396559715271, "step": 516 }, { "epoch": 0.781557067271353, "epsilon_dpo/beta": 0.006706180516630411, "epsilon_dpo/beta_margin_grad_mean": -0.40641912817955017, "epsilon_dpo/beta_margin_grad_std": 0.14336362481117249, "epsilon_dpo/beta_margin_mean": 0.4176146984100342, "epsilon_dpo/beta_margin_std": 0.6426627039909363, "epsilon_dpo/loss_margin_mean": 62.90836715698242, "grad_norm": 35.3643913269043, "kl/avg_steps": 0.40625, "kl/beta": 0.006732862442731857, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 6.998145243993284e-08, "logits/chosen": -2.99741530418396, "logits/rejected": -2.7058167457580566, "logps/chosen": -133.98065185546875, "logps/ref_chosen": -57.744102478027344, "logps/ref_rejected": -59.522891998291016, "logps/rejected": -198.66781616210938, "loss": 1.1071, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5137192010879517, "rewards/margins": 0.4176146984100342, "rewards/rejected": -0.9313338994979858, "step": 517 }, { "epoch": 0.783068783068783, "epsilon_dpo/beta": 0.006672760006040335, "epsilon_dpo/beta_margin_grad_mean": -0.3890801966190338, "epsilon_dpo/beta_margin_grad_std": 0.13398851454257965, "epsilon_dpo/beta_margin_mean": 0.49426019191741943, "epsilon_dpo/beta_margin_std": 0.6151096820831299, "epsilon_dpo/loss_margin_mean": 74.5699462890625, "grad_norm": 36.2625617980957, "kl/avg_steps": 0.5, "kl/beta": 0.006705620791763067, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.906649047373245e-08, "logits/chosen": -3.0250165462493896, "logits/rejected": -3.0383143424987793, "logps/chosen": -112.54780578613281, "logps/ref_chosen": -53.01203918457031, "logps/ref_rejected": -74.49579620361328, "logps/rejected": -208.60150146484375, "loss": 1.0375, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3979129195213318, "rewards/margins": 0.49426019191741943, "rewards/rejected": -0.8921731114387512, "step": 518 }, { "epoch": 0.7845804988662132, "epsilon_dpo/beta": 0.006647903006523848, "epsilon_dpo/beta_margin_grad_mean": -0.41425222158432007, "epsilon_dpo/beta_margin_grad_std": 0.15925246477127075, "epsilon_dpo/beta_margin_mean": 0.3807467222213745, "epsilon_dpo/beta_margin_std": 0.7065668106079102, "epsilon_dpo/loss_margin_mean": 58.0482292175293, "grad_norm": 45.81927490234375, "kl/avg_steps": 0.375, "kl/beta": 0.006672259420156479, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.815658960673781e-08, "logits/chosen": -2.8930835723876953, "logits/rejected": -2.841902256011963, "logps/chosen": -128.89305114746094, "logps/ref_chosen": -48.611289978027344, "logps/ref_rejected": -63.99748992919922, "logps/rejected": -202.32748413085938, "loss": 1.1582, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5354052782058716, "rewards/margins": 0.3807467222213745, "rewards/rejected": -0.9161520004272461, "step": 519 }, { "epoch": 0.7860922146636432, "epsilon_dpo/beta": 0.006608524359762669, "epsilon_dpo/beta_margin_grad_mean": -0.40034976601600647, "epsilon_dpo/beta_margin_grad_std": 0.15310275554656982, "epsilon_dpo/beta_margin_mean": 0.4381295144557953, "epsilon_dpo/beta_margin_std": 0.6954367756843567, "epsilon_dpo/loss_margin_mean": 66.88912200927734, "grad_norm": 36.95631408691406, "kl/avg_steps": 0.59375, "kl/beta": 0.006647332105785608, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.725177529083209e-08, "logits/chosen": -2.797213315963745, "logits/rejected": -2.8640899658203125, "logps/chosen": -135.1001739501953, "logps/ref_chosen": -56.212623596191406, "logps/ref_rejected": -68.09236145019531, "logps/rejected": -213.86904907226562, "loss": 1.1069, "rewards/accuracies": 0.75, "rewards/chosen": -0.5230200886726379, "rewards/margins": 0.4381295442581177, "rewards/rejected": -0.9611495733261108, "step": 520 }, { "epoch": 0.7876039304610734, "epsilon_dpo/beta": 0.006571582984179258, "epsilon_dpo/beta_margin_grad_mean": -0.3762890696525574, "epsilon_dpo/beta_margin_grad_std": 0.1344679445028305, "epsilon_dpo/beta_margin_mean": 0.5540176033973694, "epsilon_dpo/beta_margin_std": 0.6314072012901306, "epsilon_dpo/loss_margin_mean": 84.81503295898438, "grad_norm": 33.24909591674805, "kl/avg_steps": 0.5625, "kl/beta": 0.006608096417039633, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 6.63520728356167e-08, "logits/chosen": -3.017455577850342, "logits/rejected": -3.085193157196045, "logps/chosen": -134.95330810546875, "logps/ref_chosen": -59.9977912902832, "logps/ref_rejected": -89.10822296142578, "logps/rejected": -248.87875366210938, "loss": 0.9954, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4945220947265625, "rewards/margins": 0.5540176033973694, "rewards/rejected": -1.0485397577285767, "step": 521 }, { "epoch": 0.7891156462585034, "epsilon_dpo/beta": 0.006536878179758787, "epsilon_dpo/beta_margin_grad_mean": -0.4007669985294342, "epsilon_dpo/beta_margin_grad_std": 0.1413155347108841, "epsilon_dpo/beta_margin_mean": 0.43931877613067627, "epsilon_dpo/beta_margin_std": 0.6393123865127563, "epsilon_dpo/loss_margin_mean": 67.76762390136719, "grad_norm": 41.81180191040039, "kl/avg_steps": 0.53125, "kl/beta": 0.006571133621037006, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.545750740770336e-08, "logits/chosen": -2.9188756942749023, "logits/rejected": -2.7320048809051514, "logps/chosen": -129.12664794921875, "logps/ref_chosen": -53.707881927490234, "logps/ref_rejected": -66.1962661743164, "logps/rejected": -209.38265991210938, "loss": 1.0884, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4937530755996704, "rewards/margins": 0.43931877613067627, "rewards/rejected": -0.9330717921257019, "step": 522 }, { "epoch": 0.7906273620559335, "epsilon_dpo/beta": 0.006498249247670174, "epsilon_dpo/beta_margin_grad_mean": -0.3842654228210449, "epsilon_dpo/beta_margin_grad_std": 0.13943679630756378, "epsilon_dpo/beta_margin_mean": 0.5076578855514526, "epsilon_dpo/beta_margin_std": 0.6174559593200684, "epsilon_dpo/loss_margin_mean": 78.66818237304688, "grad_norm": 40.41939163208008, "kl/avg_steps": 0.59375, "kl/beta": 0.006536409258842468, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.456810403001012e-08, "logits/chosen": -3.0257396697998047, "logits/rejected": -2.9702131748199463, "logps/chosen": -130.9298858642578, "logps/ref_chosen": -57.9495735168457, "logps/ref_rejected": -80.56602478027344, "logps/rejected": -232.2145233154297, "loss": 1.0303, "rewards/accuracies": 0.765625, "rewards/chosen": -0.47569265961647034, "rewards/margins": 0.5076578855514526, "rewards/rejected": -0.9833505153656006, "step": 523 }, { "epoch": 0.7921390778533636, "epsilon_dpo/beta": 0.006470047403126955, "epsilon_dpo/beta_margin_grad_mean": -0.4083709716796875, "epsilon_dpo/beta_margin_grad_std": 0.140652135014534, "epsilon_dpo/beta_margin_mean": 0.4028201401233673, "epsilon_dpo/beta_margin_std": 0.6283307671546936, "epsilon_dpo/loss_margin_mean": 62.86546325683594, "grad_norm": 31.350635528564453, "kl/avg_steps": 0.4375, "kl/beta": 0.0064978282898664474, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.368388758106134e-08, "logits/chosen": -3.2298717498779297, "logits/rejected": -3.1413187980651855, "logps/chosen": -131.7079620361328, "logps/ref_chosen": -70.82115173339844, "logps/ref_rejected": -88.40533447265625, "logps/rejected": -212.15760803222656, "loss": 1.1153, "rewards/accuracies": 0.765625, "rewards/chosen": -0.39560985565185547, "rewards/margins": 0.4028201401233673, "rewards/rejected": -0.7984299659729004, "step": 524 }, { "epoch": 0.7936507936507936, "epsilon_dpo/beta": 0.00645399559289217, "epsilon_dpo/beta_margin_grad_mean": -0.4123237133026123, "epsilon_dpo/beta_margin_grad_std": 0.14350363612174988, "epsilon_dpo/beta_margin_mean": 0.3931123614311218, "epsilon_dpo/beta_margin_std": 0.6422011852264404, "epsilon_dpo/loss_margin_mean": 61.61846160888672, "grad_norm": 41.7164421081543, "kl/avg_steps": 0.25, "kl/beta": 0.006469523999840021, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 6.280488279429185e-08, "logits/chosen": -3.084261178970337, "logits/rejected": -3.0053093433380127, "logps/chosen": -145.2374725341797, "logps/ref_chosen": -67.49630737304688, "logps/ref_rejected": -80.82887268066406, "logps/rejected": -220.18850708007812, "loss": 1.1267, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5040992498397827, "rewards/margins": 0.3931123614311218, "rewards/rejected": -0.8972115516662598, "step": 525 }, { "epoch": 0.7951625094482238, "epsilon_dpo/beta": 0.006421766243875027, "epsilon_dpo/beta_margin_grad_mean": -0.4166853129863739, "epsilon_dpo/beta_margin_grad_std": 0.14431175589561462, "epsilon_dpo/beta_margin_mean": 0.3714195489883423, "epsilon_dpo/beta_margin_std": 0.6540345549583435, "epsilon_dpo/loss_margin_mean": 58.37696838378906, "grad_norm": 40.86532211303711, "kl/avg_steps": 0.5, "kl/beta": 0.006453390698879957, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.193111425735515e-08, "logits/chosen": -2.9267802238464355, "logits/rejected": -2.9615135192871094, "logps/chosen": -131.61412048339844, "logps/ref_chosen": -51.361576080322266, "logps/ref_rejected": -72.6195068359375, "logps/rejected": -211.2490234375, "loss": 1.1475, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5173400640487671, "rewards/margins": 0.3714195489883423, "rewards/rejected": -0.8887596130371094, "step": 526 }, { "epoch": 0.7966742252456538, "epsilon_dpo/beta": 0.006393830291926861, "epsilon_dpo/beta_margin_grad_mean": -0.42204707860946655, "epsilon_dpo/beta_margin_grad_std": 0.15254561603069305, "epsilon_dpo/beta_margin_mean": 0.34858542680740356, "epsilon_dpo/beta_margin_std": 0.6807777285575867, "epsilon_dpo/loss_margin_mean": 55.222347259521484, "grad_norm": 49.126426696777344, "kl/avg_steps": 0.4375, "kl/beta": 0.006421284284442663, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.106260641143546e-08, "logits/chosen": -2.844531297683716, "logits/rejected": -2.90095853805542, "logps/chosen": -143.91748046875, "logps/ref_chosen": -49.62992858886719, "logps/ref_rejected": -77.23707580566406, "logps/rejected": -226.74696350097656, "loss": 1.1757, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6048531532287598, "rewards/margins": 0.34858545660972595, "rewards/rejected": -0.9534386396408081, "step": 527 }, { "epoch": 0.7981859410430839, "epsilon_dpo/beta": 0.006365979090332985, "epsilon_dpo/beta_margin_grad_mean": -0.4285464286804199, "epsilon_dpo/beta_margin_grad_std": 0.16217948496341705, "epsilon_dpo/beta_margin_mean": 0.32109710574150085, "epsilon_dpo/beta_margin_std": 0.7377678751945496, "epsilon_dpo/loss_margin_mean": 51.20362854003906, "grad_norm": 42.96821212768555, "kl/avg_steps": 0.4375, "kl/beta": 0.006393313407897949, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.019938355056422e-08, "logits/chosen": -2.7745370864868164, "logits/rejected": -2.7953453063964844, "logps/chosen": -124.53018188476562, "logps/ref_chosen": -50.83088302612305, "logps/ref_rejected": -62.474666595458984, "logps/rejected": -187.37759399414062, "loss": 1.2164, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4711000323295593, "rewards/margins": 0.32109707593917847, "rewards/rejected": -0.7921971082687378, "step": 528 }, { "epoch": 0.799697656840514, "epsilon_dpo/beta": 0.0063223340548574924, "epsilon_dpo/beta_margin_grad_mean": -0.35607659816741943, "epsilon_dpo/beta_margin_grad_std": 0.12120664864778519, "epsilon_dpo/beta_margin_mean": 0.6321461200714111, "epsilon_dpo/beta_margin_std": 0.546614408493042, "epsilon_dpo/loss_margin_mean": 100.41100311279297, "grad_norm": 32.995635986328125, "kl/avg_steps": 0.6875, "kl/beta": 0.00636546453461051, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.934146982094049e-08, "logits/chosen": -2.859769821166992, "logits/rejected": -2.8295977115631104, "logps/chosen": -106.63168334960938, "logps/ref_chosen": -47.529258728027344, "logps/ref_rejected": -69.37388610839844, "logps/rejected": -228.88731384277344, "loss": 0.9195, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3749893307685852, "rewards/margins": 0.6321461200714111, "rewards/rejected": -1.0071353912353516, "step": 529 }, { "epoch": 0.8012093726379441, "epsilon_dpo/beta": 0.006292995531111956, "epsilon_dpo/beta_margin_grad_mean": -0.39428189396858215, "epsilon_dpo/beta_margin_grad_std": 0.13655197620391846, "epsilon_dpo/beta_margin_mean": 0.4761402904987335, "epsilon_dpo/beta_margin_std": 0.6301091909408569, "epsilon_dpo/loss_margin_mean": 76.23155212402344, "grad_norm": 32.69504165649414, "kl/avg_steps": 0.46875, "kl/beta": 0.00632200064137578, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.848888922025552e-08, "logits/chosen": -2.945476770401001, "logits/rejected": -2.8768386840820312, "logps/chosen": -118.99136352539062, "logps/ref_chosen": -56.40074157714844, "logps/ref_rejected": -67.39358520507812, "logps/rejected": -206.21575927734375, "loss": 1.0553, "rewards/accuracies": 0.75, "rewards/chosen": -0.3948723077774048, "rewards/margins": 0.47614026069641113, "rewards/rejected": -0.8710125684738159, "step": 530 }, { "epoch": 0.8027210884353742, "epsilon_dpo/beta": 0.0062675680965185165, "epsilon_dpo/beta_margin_grad_mean": -0.40403202176094055, "epsilon_dpo/beta_margin_grad_std": 0.12742485105991364, "epsilon_dpo/beta_margin_mean": 0.42395439743995667, "epsilon_dpo/beta_margin_std": 0.5731499195098877, "epsilon_dpo/loss_margin_mean": 68.20128631591797, "grad_norm": 38.166290283203125, "kl/avg_steps": 0.40625, "kl/beta": 0.006292504724115133, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.7641665597021435e-08, "logits/chosen": -2.8602776527404785, "logits/rejected": -2.9472453594207764, "logps/chosen": -113.72921752929688, "logps/ref_chosen": -45.397708892822266, "logps/ref_rejected": -74.88275146484375, "logps/rejected": -211.41554260253906, "loss": 1.0822, "rewards/accuracies": 0.75, "rewards/chosen": -0.4295659065246582, "rewards/margins": 0.42395442724227905, "rewards/rejected": -0.8535202741622925, "step": 531 }, { "epoch": 0.8042328042328042, "epsilon_dpo/beta": 0.00622849864885211, "epsilon_dpo/beta_margin_grad_mean": -0.37603089213371277, "epsilon_dpo/beta_margin_grad_std": 0.14204396307468414, "epsilon_dpo/beta_margin_mean": 0.5429787635803223, "epsilon_dpo/beta_margin_std": 0.6385970711708069, "epsilon_dpo/loss_margin_mean": 87.7481918334961, "grad_norm": 35.18793487548828, "kl/avg_steps": 0.625, "kl/beta": 0.006267044693231583, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.679982264990424e-08, "logits/chosen": -2.926102876663208, "logits/rejected": -2.7412655353546143, "logps/chosen": -130.45968627929688, "logps/ref_chosen": -50.93199157714844, "logps/ref_rejected": -66.29621887207031, "logps/rejected": -233.5720977783203, "loss": 1.0095, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4965301752090454, "rewards/margins": 0.5429787635803223, "rewards/rejected": -1.0395090579986572, "step": 532 }, { "epoch": 0.8057445200302343, "epsilon_dpo/beta": 0.006191758438944817, "epsilon_dpo/beta_margin_grad_mean": -0.4149245023727417, "epsilon_dpo/beta_margin_grad_std": 0.12032075226306915, "epsilon_dpo/beta_margin_mean": 0.3643096685409546, "epsilon_dpo/beta_margin_std": 0.5264121294021606, "epsilon_dpo/loss_margin_mean": 59.282554626464844, "grad_norm": 30.956478118896484, "kl/avg_steps": 0.59375, "kl/beta": 0.006228119134902954, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.596338392706076e-08, "logits/chosen": -2.9853875637054443, "logits/rejected": -3.003324031829834, "logps/chosen": -111.393798828125, "logps/ref_chosen": -49.83143997192383, "logps/ref_rejected": -68.7999267578125, "logps/rejected": -189.64483642578125, "loss": 1.1201, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3822932243347168, "rewards/margins": 0.364309698343277, "rewards/rejected": -0.7466028928756714, "step": 533 }, { "epoch": 0.8072562358276644, "epsilon_dpo/beta": 0.006164886988699436, "epsilon_dpo/beta_margin_grad_mean": -0.40551069378852844, "epsilon_dpo/beta_margin_grad_std": 0.14718236029148102, "epsilon_dpo/beta_margin_mean": 0.4213190972805023, "epsilon_dpo/beta_margin_std": 0.6546342968940735, "epsilon_dpo/loss_margin_mean": 69.02452850341797, "grad_norm": 36.24626541137695, "kl/avg_steps": 0.4375, "kl/beta": 0.006191357970237732, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.513237282548033e-08, "logits/chosen": -3.1082310676574707, "logits/rejected": -3.0677051544189453, "logps/chosen": -118.0005874633789, "logps/ref_chosen": -53.24787902832031, "logps/ref_rejected": -75.25416564941406, "logps/rejected": -209.03140258789062, "loss": 1.1082, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40090060234069824, "rewards/margins": 0.4213190972805023, "rewards/rejected": -0.8222196698188782, "step": 534 }, { "epoch": 0.8087679516250945, "epsilon_dpo/beta": 0.00613225344568491, "epsilon_dpo/beta_margin_grad_mean": -0.3959249258041382, "epsilon_dpo/beta_margin_grad_std": 0.15420867502689362, "epsilon_dpo/beta_margin_mean": 0.4675506353378296, "epsilon_dpo/beta_margin_std": 0.7043803930282593, "epsilon_dpo/loss_margin_mean": 76.9376449584961, "grad_norm": 34.001956939697266, "kl/avg_steps": 0.53125, "kl/beta": 0.006164388731122017, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.430681259032957e-08, "logits/chosen": -2.955233573913574, "logits/rejected": -2.943449020385742, "logps/chosen": -130.8115234375, "logps/ref_chosen": -56.85047912597656, "logps/ref_rejected": -72.63682556152344, "logps/rejected": -223.53550720214844, "loss": 1.0857, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4558217525482178, "rewards/margins": 0.4675506353378296, "rewards/rejected": -0.9233723878860474, "step": 535 }, { "epoch": 0.8102796674225246, "epsilon_dpo/beta": 0.006096015218645334, "epsilon_dpo/beta_margin_grad_mean": -0.3757854104042053, "epsilon_dpo/beta_margin_grad_std": 0.12197672575712204, "epsilon_dpo/beta_margin_mean": 0.5512234568595886, "epsilon_dpo/beta_margin_std": 0.5709772109985352, "epsilon_dpo/loss_margin_mean": 90.8038101196289, "grad_norm": 31.365121841430664, "kl/avg_steps": 0.59375, "kl/beta": 0.006131813395768404, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -2.8355836868286133, "logits/rejected": -2.8151893615722656, "logps/chosen": -109.3181381225586, "logps/ref_chosen": -46.26768112182617, "logps/ref_rejected": -61.68131637573242, "logps/rejected": -215.53558349609375, "loss": 0.9818, "rewards/accuracies": 0.875, "rewards/chosen": -0.3858313262462616, "rewards/margins": 0.5512235164642334, "rewards/rejected": -0.9370548129081726, "step": 536 }, { "epoch": 0.8117913832199547, "epsilon_dpo/beta": 0.006073369644582272, "epsilon_dpo/beta_margin_grad_mean": -0.38766026496887207, "epsilon_dpo/beta_margin_grad_std": 0.1522054821252823, "epsilon_dpo/beta_margin_mean": 0.5152029991149902, "epsilon_dpo/beta_margin_std": 0.719580352306366, "epsilon_dpo/loss_margin_mean": 85.6109390258789, "grad_norm": 29.331253051757812, "kl/avg_steps": 0.375, "kl/beta": 0.006095620803534985, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 5.267213693697695e-08, "logits/chosen": -2.8922853469848633, "logits/rejected": -3.0492963790893555, "logps/chosen": -127.57013702392578, "logps/ref_chosen": -53.744956970214844, "logps/ref_rejected": -91.43638610839844, "logps/rejected": -250.87249755859375, "loss": 1.0506, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45007026195526123, "rewards/margins": 0.5152029991149902, "rewards/rejected": -0.9652732610702515, "step": 537 }, { "epoch": 0.8133030990173847, "epsilon_dpo/beta": 0.006035496015101671, "epsilon_dpo/beta_margin_grad_mean": -0.38688790798187256, "epsilon_dpo/beta_margin_grad_std": 0.13156120479106903, "epsilon_dpo/beta_margin_mean": 0.49831441044807434, "epsilon_dpo/beta_margin_std": 0.6034212708473206, "epsilon_dpo/loss_margin_mean": 83.04730987548828, "grad_norm": 32.05858612060547, "kl/avg_steps": 0.625, "kl/beta": 0.0060728476382792, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.1863067244167144e-08, "logits/chosen": -3.0710129737854004, "logits/rejected": -2.864048480987549, "logps/chosen": -136.90936279296875, "logps/ref_chosen": -62.525142669677734, "logps/ref_rejected": -72.6083984375, "logps/rejected": -230.03994750976562, "loss": 1.0315, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4498280882835388, "rewards/margins": 0.49831441044807434, "rewards/rejected": -0.9481425285339355, "step": 538 }, { "epoch": 0.8148148148148148, "epsilon_dpo/beta": 0.006002714391797781, "epsilon_dpo/beta_margin_grad_mean": -0.41232961416244507, "epsilon_dpo/beta_margin_grad_std": 0.11742759495973587, "epsilon_dpo/beta_margin_mean": 0.3830870985984802, "epsilon_dpo/beta_margin_std": 0.52591472864151, "epsilon_dpo/loss_margin_mean": 64.27487182617188, "grad_norm": 38.21563720703125, "kl/avg_steps": 0.546875, "kl/beta": 0.006035128142684698, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.105953986729195e-08, "logits/chosen": -2.7718586921691895, "logits/rejected": -3.04723858833313, "logps/chosen": -126.07052612304688, "logps/ref_chosen": -50.92280578613281, "logps/ref_rejected": -78.22198486328125, "logps/rejected": -217.64456176757812, "loss": 1.1034, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4516846835613251, "rewards/margins": 0.3830870985984802, "rewards/rejected": -0.8347717523574829, "step": 539 }, { "epoch": 0.8163265306122449, "epsilon_dpo/beta": 0.005967260804027319, "epsilon_dpo/beta_margin_grad_mean": -0.3793550133705139, "epsilon_dpo/beta_margin_grad_std": 0.1340833604335785, "epsilon_dpo/beta_margin_mean": 0.5425504446029663, "epsilon_dpo/beta_margin_std": 0.6456456780433655, "epsilon_dpo/loss_margin_mean": 91.48410034179688, "grad_norm": 30.373767852783203, "kl/avg_steps": 0.59375, "kl/beta": 0.006002302747219801, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.026157728273966e-08, "logits/chosen": -2.96051287651062, "logits/rejected": -3.120746374130249, "logps/chosen": -122.25755310058594, "logps/ref_chosen": -56.9200553894043, "logps/ref_rejected": -87.46377563476562, "logps/rejected": -244.28536987304688, "loss": 1.0057, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3915056586265564, "rewards/margins": 0.5425504446029663, "rewards/rejected": -0.9340561032295227, "step": 540 }, { "epoch": 0.817838246409675, "epsilon_dpo/beta": 0.005940421484410763, "epsilon_dpo/beta_margin_grad_mean": -0.3963332176208496, "epsilon_dpo/beta_margin_grad_std": 0.1278933882713318, "epsilon_dpo/beta_margin_mean": 0.4507373869419098, "epsilon_dpo/beta_margin_std": 0.5560808777809143, "epsilon_dpo/loss_margin_mean": 76.5111083984375, "grad_norm": 33.86478805541992, "kl/avg_steps": 0.453125, "kl/beta": 0.0059668743051588535, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.9469201811239035e-08, "logits/chosen": -2.904022216796875, "logits/rejected": -2.5795416831970215, "logps/chosen": -126.27151489257812, "logps/ref_chosen": -56.62158203125, "logps/ref_rejected": -49.137245178222656, "logps/rejected": -195.29827880859375, "loss": 1.0583, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41583192348480225, "rewards/margins": 0.4507373571395874, "rewards/rejected": -0.8665692806243896, "step": 541 }, { "epoch": 0.8193499622071051, "epsilon_dpo/beta": 0.0059052808210253716, "epsilon_dpo/beta_margin_grad_mean": -0.39027562737464905, "epsilon_dpo/beta_margin_grad_std": 0.1456802785396576, "epsilon_dpo/beta_margin_mean": 0.48573678731918335, "epsilon_dpo/beta_margin_std": 0.6594421863555908, "epsilon_dpo/loss_margin_mean": 82.8905029296875, "grad_norm": 34.327449798583984, "kl/avg_steps": 0.59375, "kl/beta": 0.005939959082752466, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.868243561723534e-08, "logits/chosen": -2.8303732872009277, "logits/rejected": -2.746593952178955, "logps/chosen": -107.84027099609375, "logps/ref_chosen": -43.757328033447266, "logps/ref_rejected": -63.05831527709961, "logps/rejected": -210.03176879882812, "loss": 1.0581, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3797900378704071, "rewards/margins": 0.48573678731918335, "rewards/rejected": -0.8655267953872681, "step": 542 }, { "epoch": 0.8208616780045351, "epsilon_dpo/beta": 0.005872270558029413, "epsilon_dpo/beta_margin_grad_mean": -0.37960028648376465, "epsilon_dpo/beta_margin_grad_std": 0.1290379911661148, "epsilon_dpo/beta_margin_mean": 0.532049834728241, "epsilon_dpo/beta_margin_std": 0.5892606973648071, "epsilon_dpo/loss_margin_mean": 91.17375183105469, "grad_norm": 30.82787322998047, "kl/avg_steps": 0.5625, "kl/beta": 0.005904898513108492, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.790130070827028e-08, "logits/chosen": -2.730346202850342, "logits/rejected": -2.84165620803833, "logps/chosen": -111.89370727539062, "logps/ref_chosen": -43.32347869873047, "logps/ref_rejected": -70.88679504394531, "logps/rejected": -230.63076782226562, "loss": 1.0022, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4031725823879242, "rewards/margins": 0.5320498943328857, "rewards/rejected": -0.9352225065231323, "step": 543 }, { "epoch": 0.8223733938019653, "epsilon_dpo/beta": 0.005843094550073147, "epsilon_dpo/beta_margin_grad_mean": -0.38737496733665466, "epsilon_dpo/beta_margin_grad_std": 0.1271100491285324, "epsilon_dpo/beta_margin_mean": 0.49497345089912415, "epsilon_dpo/beta_margin_std": 0.5664626955986023, "epsilon_dpo/loss_margin_mean": 85.2799072265625, "grad_norm": 33.99992370605469, "kl/avg_steps": 0.5, "kl/beta": 0.005871869623661041, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.7125818934366454e-08, "logits/chosen": -2.927542209625244, "logits/rejected": -2.9868786334991455, "logps/chosen": -122.0878677368164, "logps/ref_chosen": -54.617713928222656, "logps/ref_rejected": -79.12300109863281, "logps/rejected": -231.87307739257812, "loss": 1.0255, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3952871561050415, "rewards/margins": 0.49497348070144653, "rewards/rejected": -0.8902605772018433, "step": 544 }, { "epoch": 0.8238851095993953, "epsilon_dpo/beta": 0.00581585057079792, "epsilon_dpo/beta_margin_grad_mean": -0.4130638837814331, "epsilon_dpo/beta_margin_grad_std": 0.13133418560028076, "epsilon_dpo/beta_margin_mean": 0.3787357211112976, "epsilon_dpo/beta_margin_std": 0.5837143063545227, "epsilon_dpo/loss_margin_mean": 65.7337417602539, "grad_norm": 34.081382751464844, "kl/avg_steps": 0.46875, "kl/beta": 0.00584265636280179, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.635601198741607e-08, "logits/chosen": -2.8554515838623047, "logits/rejected": -2.8941726684570312, "logps/chosen": -133.48377990722656, "logps/ref_chosen": -54.19837188720703, "logps/ref_rejected": -68.26891326904297, "logps/rejected": -213.28805541992188, "loss": 1.1224, "rewards/accuracies": 0.75, "rewards/chosen": -0.46269890666007996, "rewards/margins": 0.3787357211112976, "rewards/rejected": -0.8414345979690552, "step": 545 }, { "epoch": 0.8253968253968254, "epsilon_dpo/beta": 0.005779628176242113, "epsilon_dpo/beta_margin_grad_mean": -0.4025329053401947, "epsilon_dpo/beta_margin_grad_std": 0.11457894742488861, "epsilon_dpo/beta_margin_mean": 0.41488125920295715, "epsilon_dpo/beta_margin_std": 0.4932222068309784, "epsilon_dpo/loss_margin_mean": 72.2381591796875, "grad_norm": 33.07106399536133, "kl/avg_steps": 0.625, "kl/beta": 0.005815396551042795, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.559190140057428e-08, "logits/chosen": -2.7694177627563477, "logits/rejected": -2.613293409347534, "logps/chosen": -117.21865844726562, "logps/ref_chosen": -49.915283203125, "logps/ref_rejected": -56.072879791259766, "logps/rejected": -195.61441040039062, "loss": 1.0717, "rewards/accuracies": 0.8125, "rewards/chosen": -0.39043018221855164, "rewards/margins": 0.41488125920295715, "rewards/rejected": -0.8053114414215088, "step": 546 }, { "epoch": 0.8269085411942555, "epsilon_dpo/beta": 0.0057328930124640465, "epsilon_dpo/beta_margin_grad_mean": -0.37843915820121765, "epsilon_dpo/beta_margin_grad_std": 0.1237616240978241, "epsilon_dpo/beta_margin_mean": 0.5358549952507019, "epsilon_dpo/beta_margin_std": 0.6030821800231934, "epsilon_dpo/loss_margin_mean": 93.8188705444336, "grad_norm": 29.444089889526367, "kl/avg_steps": 0.8125, "kl/beta": 0.0057792761363089085, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.483350854765672e-08, "logits/chosen": -2.8800837993621826, "logits/rejected": -2.922487258911133, "logps/chosen": -109.74982452392578, "logps/ref_chosen": -45.972801208496094, "logps/ref_rejected": -69.6629867553711, "logps/rejected": -227.25888061523438, "loss": 0.9991, "rewards/accuracies": 0.90625, "rewards/chosen": -0.36659324169158936, "rewards/margins": 0.5358549952507019, "rewards/rejected": -0.9024481773376465, "step": 547 }, { "epoch": 0.8284202569916855, "epsilon_dpo/beta": 0.005706396419554949, "epsilon_dpo/beta_margin_grad_mean": -0.41200029850006104, "epsilon_dpo/beta_margin_grad_std": 0.11187729239463806, "epsilon_dpo/beta_margin_mean": 0.3801129162311554, "epsilon_dpo/beta_margin_std": 0.4975734353065491, "epsilon_dpo/loss_margin_mean": 67.13646697998047, "grad_norm": 39.06440353393555, "kl/avg_steps": 0.46875, "kl/beta": 0.0057326979003846645, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.4080854642541826e-08, "logits/chosen": -2.8946030139923096, "logits/rejected": -2.976044178009033, "logps/chosen": -143.94869995117188, "logps/ref_chosen": -55.202659606933594, "logps/ref_rejected": -75.2496566772461, "logps/rejected": -231.13217163085938, "loss": 1.0993, "rewards/accuracies": 0.75, "rewards/chosen": -0.5070828199386597, "rewards/margins": 0.380112886428833, "rewards/rejected": -0.8871957063674927, "step": 548 }, { "epoch": 0.8299319727891157, "epsilon_dpo/beta": 0.0056815557181835175, "epsilon_dpo/beta_margin_grad_mean": -0.41167497634887695, "epsilon_dpo/beta_margin_grad_std": 0.14200666546821594, "epsilon_dpo/beta_margin_mean": 0.3933207392692566, "epsilon_dpo/beta_margin_std": 0.6406897306442261, "epsilon_dpo/loss_margin_mean": 69.91837310791016, "grad_norm": 36.91124725341797, "kl/avg_steps": 0.4375, "kl/beta": 0.005705951247364283, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.333396073857723e-08, "logits/chosen": -2.9612035751342773, "logits/rejected": -3.0617923736572266, "logps/chosen": -127.36709594726562, "logps/ref_chosen": -54.56769943237305, "logps/ref_rejected": -91.24746704101562, "logps/rejected": -233.96524047851562, "loss": 1.1257, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4152619242668152, "rewards/margins": 0.3933207392692566, "rewards/rejected": -0.8085826635360718, "step": 549 }, { "epoch": 0.8314436885865457, "epsilon_dpo/beta": 0.005657685920596123, "epsilon_dpo/beta_margin_grad_mean": -0.4352229833602905, "epsilon_dpo/beta_margin_grad_std": 0.14876194298267365, "epsilon_dpo/beta_margin_mean": 0.27707934379577637, "epsilon_dpo/beta_margin_std": 0.6561112999916077, "epsilon_dpo/loss_margin_mean": 49.77057647705078, "grad_norm": 31.365854263305664, "kl/avg_steps": 0.421875, "kl/beta": 0.005681096576154232, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.259284772799099e-08, "logits/chosen": -2.8737735748291016, "logits/rejected": -2.75887131690979, "logps/chosen": -144.96083068847656, "logps/ref_chosen": -54.1147575378418, "logps/ref_rejected": -55.54188537597656, "logps/rejected": -196.15853881835938, "loss": 1.23, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5165128707885742, "rewards/margins": 0.27707934379577637, "rewards/rejected": -0.7935922145843506, "step": 550 }, { "epoch": 0.8329554043839759, "epsilon_dpo/beta": 0.005632149986922741, "epsilon_dpo/beta_margin_grad_mean": -0.41093331575393677, "epsilon_dpo/beta_margin_grad_std": 0.1189160868525505, "epsilon_dpo/beta_margin_mean": 0.3892238140106201, "epsilon_dpo/beta_margin_std": 0.5528534650802612, "epsilon_dpo/loss_margin_mean": 69.67448425292969, "grad_norm": 40.25715255737305, "kl/avg_steps": 0.453125, "kl/beta": 0.0056572300381958485, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.1857536341307176e-08, "logits/chosen": -2.91385555267334, "logits/rejected": -2.970834732055664, "logps/chosen": -144.48776245117188, "logps/ref_chosen": -58.17372131347656, "logps/ref_rejected": -79.69941711425781, "logps/rejected": -235.68792724609375, "loss": 1.1031, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4870685636997223, "rewards/margins": 0.3892238140106201, "rewards/rejected": -0.87629234790802, "step": 551 }, { "epoch": 0.8344671201814059, "epsilon_dpo/beta": 0.005598832853138447, "epsilon_dpo/beta_margin_grad_mean": -0.4119429588317871, "epsilon_dpo/beta_margin_grad_std": 0.11989124864339828, "epsilon_dpo/beta_margin_mean": 0.37389206886291504, "epsilon_dpo/beta_margin_std": 0.5159745812416077, "epsilon_dpo/loss_margin_mean": 67.30302429199219, "grad_norm": 31.336740493774414, "kl/avg_steps": 0.59375, "kl/beta": 0.005631711333990097, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.112804714676593e-08, "logits/chosen": -2.9715137481689453, "logits/rejected": -2.991689682006836, "logps/chosen": -132.0955352783203, "logps/ref_chosen": -57.822784423828125, "logps/ref_rejected": -69.76190948486328, "logps/rejected": -211.33767700195312, "loss": 1.1104, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4171101450920105, "rewards/margins": 0.37389206886291504, "rewards/rejected": -0.7910022139549255, "step": 552 }, { "epoch": 0.8359788359788359, "epsilon_dpo/beta": 0.005574534647166729, "epsilon_dpo/beta_margin_grad_mean": -0.4040873348712921, "epsilon_dpo/beta_margin_grad_std": 0.13302232325077057, "epsilon_dpo/beta_margin_mean": 0.42919787764549255, "epsilon_dpo/beta_margin_std": 0.6149832010269165, "epsilon_dpo/loss_margin_mean": 77.68640899658203, "grad_norm": 38.32421875, "kl/avg_steps": 0.4375, "kl/beta": 0.005598470568656921, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.0404400549748144e-08, "logits/chosen": -2.7747843265533447, "logits/rejected": -2.8361945152282715, "logps/chosen": -128.4781494140625, "logps/ref_chosen": -50.11827087402344, "logps/ref_rejected": -76.44326782226562, "logps/rejected": -232.4895477294922, "loss": 1.0877, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4373798966407776, "rewards/margins": 0.42919787764549255, "rewards/rejected": -0.8665777444839478, "step": 553 }, { "epoch": 0.8374905517762661, "epsilon_dpo/beta": 0.005543284118175507, "epsilon_dpo/beta_margin_grad_mean": -0.4028905928134918, "epsilon_dpo/beta_margin_grad_std": 0.13257694244384766, "epsilon_dpo/beta_margin_mean": 0.41732776165008545, "epsilon_dpo/beta_margin_std": 0.5878807306289673, "epsilon_dpo/loss_margin_mean": 75.90509033203125, "grad_norm": 38.00951385498047, "kl/avg_steps": 0.5625, "kl/beta": 0.005574083887040615, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.968661679220467e-08, "logits/chosen": -2.923233985900879, "logits/rejected": -2.870636224746704, "logps/chosen": -138.48934936523438, "logps/ref_chosen": -58.00884246826172, "logps/ref_rejected": -67.21849060058594, "logps/rejected": -223.60409545898438, "loss": 1.0928, "rewards/accuracies": 0.796875, "rewards/chosen": -0.44757354259490967, "rewards/margins": 0.41732776165008545, "rewards/rejected": -0.8649013042449951, "step": 554 }, { "epoch": 0.8390022675736961, "epsilon_dpo/beta": 0.005510545335710049, "epsilon_dpo/beta_margin_grad_mean": -0.40936601161956787, "epsilon_dpo/beta_margin_grad_std": 0.13739651441574097, "epsilon_dpo/beta_margin_mean": 0.38663244247436523, "epsilon_dpo/beta_margin_std": 0.5907249450683594, "epsilon_dpo/loss_margin_mean": 70.83427429199219, "grad_norm": 27.83380699157715, "kl/avg_steps": 0.59375, "kl/beta": 0.005542905069887638, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.89747159520904e-08, "logits/chosen": -2.963087558746338, "logits/rejected": -2.7683815956115723, "logps/chosen": -136.2809600830078, "logps/ref_chosen": -60.854896545410156, "logps/ref_rejected": -58.344696044921875, "logps/rejected": -204.60504150390625, "loss": 1.1199, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4176791310310364, "rewards/margins": 0.38663244247436523, "rewards/rejected": -0.8043116331100464, "step": 555 }, { "epoch": 0.8405139833711263, "epsilon_dpo/beta": 0.0054831854067742825, "epsilon_dpo/beta_margin_grad_mean": -0.4070160388946533, "epsilon_dpo/beta_margin_grad_std": 0.1391827017068863, "epsilon_dpo/beta_margin_mean": 0.40655556321144104, "epsilon_dpo/beta_margin_std": 0.6125519871711731, "epsilon_dpo/loss_margin_mean": 74.86852264404297, "grad_norm": 35.101261138916016, "kl/avg_steps": 0.5, "kl/beta": 0.005510188173502684, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.826871794280192e-08, "logits/chosen": -2.686767101287842, "logits/rejected": -2.647946834564209, "logps/chosen": -126.06890869140625, "logps/ref_chosen": -41.8829345703125, "logps/ref_rejected": -58.50324249267578, "logps/rejected": -217.5577392578125, "loss": 1.1084, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4630902409553528, "rewards/margins": 0.40655556321144104, "rewards/rejected": -0.8696458339691162, "step": 556 }, { "epoch": 0.8420256991685563, "epsilon_dpo/beta": 0.005454192403703928, "epsilon_dpo/beta_margin_grad_mean": -0.39166730642318726, "epsilon_dpo/beta_margin_grad_std": 0.1265869140625, "epsilon_dpo/beta_margin_mean": 0.4764001965522766, "epsilon_dpo/beta_margin_std": 0.5695289373397827, "epsilon_dpo/loss_margin_mean": 87.9591064453125, "grad_norm": 30.492870330810547, "kl/avg_steps": 0.53125, "kl/beta": 0.00548277422785759, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.756864251262143e-08, "logits/chosen": -2.855372190475464, "logits/rejected": -2.91074275970459, "logps/chosen": -136.32662963867188, "logps/ref_chosen": -51.69257354736328, "logps/ref_rejected": -71.78218078613281, "logps/rejected": -244.37533569335938, "loss": 1.04, "rewards/accuracies": 0.75, "rewards/chosen": -0.46403536200523376, "rewards/margins": 0.4764001965522766, "rewards/rejected": -0.940435528755188, "step": 557 }, { "epoch": 0.8435374149659864, "epsilon_dpo/beta": 0.005425369832664728, "epsilon_dpo/beta_margin_grad_mean": -0.3974941074848175, "epsilon_dpo/beta_margin_grad_std": 0.1295580118894577, "epsilon_dpo/beta_margin_mean": 0.4458405673503876, "epsilon_dpo/beta_margin_std": 0.5686999559402466, "epsilon_dpo/loss_margin_mean": 82.80966186523438, "grad_norm": 25.268863677978516, "kl/avg_steps": 0.53125, "kl/beta": 0.005453800782561302, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.687450924416341e-08, "logits/chosen": -3.0402002334594727, "logits/rejected": -2.958615303039551, "logps/chosen": -138.29959106445312, "logps/ref_chosen": -58.85462951660156, "logps/ref_rejected": -80.87055969238281, "logps/rejected": -243.12518310546875, "loss": 1.0651, "rewards/accuracies": 0.78125, "rewards/chosen": -0.43247419595718384, "rewards/margins": 0.4458405673503876, "rewards/rejected": -0.878314733505249, "step": 558 }, { "epoch": 0.8450491307634165, "epsilon_dpo/beta": 0.005399234592914581, "epsilon_dpo/beta_margin_grad_mean": -0.39557376503944397, "epsilon_dpo/beta_margin_grad_std": 0.13823488354682922, "epsilon_dpo/beta_margin_mean": 0.4677964448928833, "epsilon_dpo/beta_margin_std": 0.6431779265403748, "epsilon_dpo/loss_margin_mean": 87.31603240966797, "grad_norm": 25.808263778686523, "kl/avg_steps": 0.484375, "kl/beta": 0.005424980539828539, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.6186337553827743e-08, "logits/chosen": -2.821948766708374, "logits/rejected": -2.8376193046569824, "logps/chosen": -132.65518188476562, "logps/ref_chosen": -51.850799560546875, "logps/ref_rejected": -76.71221923828125, "logps/rejected": -244.8326416015625, "loss": 1.0651, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4373553395271301, "rewards/margins": 0.4677964448928833, "rewards/rejected": -0.9051517844200134, "step": 559 }, { "epoch": 0.8465608465608465, "epsilon_dpo/beta": 0.005374895874410868, "epsilon_dpo/beta_margin_grad_mean": -0.4086878299713135, "epsilon_dpo/beta_margin_grad_std": 0.12076151371002197, "epsilon_dpo/beta_margin_mean": 0.39374908804893494, "epsilon_dpo/beta_margin_std": 0.5231068730354309, "epsilon_dpo/loss_margin_mean": 73.8972396850586, "grad_norm": 36.12723159790039, "kl/avg_steps": 0.453125, "kl/beta": 0.005398830398917198, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.550414669125573e-08, "logits/chosen": -2.8402633666992188, "logits/rejected": -2.846738338470459, "logps/chosen": -140.06784057617188, "logps/ref_chosen": -52.74456787109375, "logps/ref_rejected": -71.90850067138672, "logps/rejected": -233.1290283203125, "loss": 1.0955, "rewards/accuracies": 0.71875, "rewards/chosen": -0.47079208493232727, "rewards/margins": 0.39374905824661255, "rewards/rejected": -0.8645411729812622, "step": 560 }, { "epoch": 0.8480725623582767, "epsilon_dpo/beta": 0.005349819548428059, "epsilon_dpo/beta_margin_grad_mean": -0.41835686564445496, "epsilon_dpo/beta_margin_grad_std": 0.11519598215818405, "epsilon_dpo/beta_margin_mean": 0.350050151348114, "epsilon_dpo/beta_margin_std": 0.4948336184024811, "epsilon_dpo/loss_margin_mean": 66.0028076171875, "grad_norm": 26.221477508544922, "kl/avg_steps": 0.46875, "kl/beta": 0.005374477244913578, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.482795573879241e-08, "logits/chosen": -2.9749813079833984, "logits/rejected": -2.8635973930358887, "logps/chosen": -143.88693237304688, "logps/ref_chosen": -55.23572540283203, "logps/ref_rejected": -69.04344177246094, "logps/rejected": -223.6974639892578, "loss": 1.1248, "rewards/accuracies": 0.75, "rewards/chosen": -0.4758322834968567, "rewards/margins": 0.350050151348114, "rewards/rejected": -0.8258824348449707, "step": 561 }, { "epoch": 0.8495842781557067, "epsilon_dpo/beta": 0.005324859172105789, "epsilon_dpo/beta_margin_grad_mean": -0.4036167562007904, "epsilon_dpo/beta_margin_grad_std": 0.1328066885471344, "epsilon_dpo/beta_margin_mean": 0.4261111617088318, "epsilon_dpo/beta_margin_std": 0.5968400239944458, "epsilon_dpo/loss_margin_mean": 80.71043395996094, "grad_norm": 31.69719886779785, "kl/avg_steps": 0.46875, "kl/beta": 0.005349401850253344, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.415778361095226e-08, "logits/chosen": -3.0495078563690186, "logits/rejected": -3.045661687850952, "logps/chosen": -165.94175720214844, "logps/ref_chosen": -71.56375885009766, "logps/ref_rejected": -84.80111694335938, "logps/rejected": -259.8895568847656, "loss": 1.0869, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5041170120239258, "rewards/margins": 0.4261111617088318, "rewards/rejected": -0.9302281141281128, "step": 562 }, { "epoch": 0.8510959939531368, "epsilon_dpo/beta": 0.005291694775223732, "epsilon_dpo/beta_margin_grad_mean": -0.3964901268482208, "epsilon_dpo/beta_margin_grad_std": 0.1304260790348053, "epsilon_dpo/beta_margin_mean": 0.4481057822704315, "epsilon_dpo/beta_margin_std": 0.5686075687408447, "epsilon_dpo/loss_margin_mean": 85.28060913085938, "grad_norm": 33.23506546020508, "kl/avg_steps": 0.625, "kl/beta": 0.005324443336576223, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.349364905389032e-08, "logits/chosen": -2.7171127796173096, "logits/rejected": -2.7322144508361816, "logps/chosen": -104.35655212402344, "logps/ref_chosen": -42.256004333496094, "logps/ref_rejected": -62.18010711669922, "logps/rejected": -209.56126403808594, "loss": 1.0637, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3300304710865021, "rewards/margins": 0.4481058120727539, "rewards/rejected": -0.7781362533569336, "step": 563 }, { "epoch": 0.8526077097505669, "epsilon_dpo/beta": 0.005265442188829184, "epsilon_dpo/beta_margin_grad_mean": -0.392487496137619, "epsilon_dpo/beta_margin_grad_std": 0.13288848102092743, "epsilon_dpo/beta_margin_mean": 0.46262475848197937, "epsilon_dpo/beta_margin_std": 0.5870823264122009, "epsilon_dpo/loss_margin_mean": 88.58763885498047, "grad_norm": 24.4987735748291, "kl/avg_steps": 0.5, "kl/beta": 0.005291372537612915, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.283557064487785e-08, "logits/chosen": -2.898423194885254, "logits/rejected": -2.969186305999756, "logps/chosen": -123.16287231445312, "logps/ref_chosen": -51.451927185058594, "logps/ref_rejected": -73.02340698242188, "logps/rejected": -233.32199096679688, "loss": 1.057, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37985044717788696, "rewards/margins": 0.46262478828430176, "rewards/rejected": -0.842475175857544, "step": 564 }, { "epoch": 0.854119425547997, "epsilon_dpo/beta": 0.005240891128778458, "epsilon_dpo/beta_margin_grad_mean": -0.4184736907482147, "epsilon_dpo/beta_margin_grad_std": 0.12026441842317581, "epsilon_dpo/beta_margin_mean": 0.34834542870521545, "epsilon_dpo/beta_margin_std": 0.5139597058296204, "epsilon_dpo/loss_margin_mean": 67.06075286865234, "grad_norm": 30.50493621826172, "kl/avg_steps": 0.46875, "kl/beta": 0.005265047308057547, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.218356679178252e-08, "logits/chosen": -2.8498477935791016, "logits/rejected": -2.7176618576049805, "logps/chosen": -159.74114990234375, "logps/ref_chosen": -59.28217315673828, "logps/ref_rejected": -70.09977722167969, "logps/rejected": -237.61949157714844, "loss": 1.1311, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5282279253005981, "rewards/margins": 0.34834545850753784, "rewards/rejected": -0.8765733242034912, "step": 565 }, { "epoch": 0.8556311413454271, "epsilon_dpo/beta": 0.005219714716076851, "epsilon_dpo/beta_margin_grad_mean": -0.3980344235897064, "epsilon_dpo/beta_margin_grad_std": 0.14285139739513397, "epsilon_dpo/beta_margin_mean": 0.4586411416530609, "epsilon_dpo/beta_margin_std": 0.6637029051780701, "epsilon_dpo/loss_margin_mean": 88.65043640136719, "grad_norm": 29.21318244934082, "kl/avg_steps": 0.40625, "kl/beta": 0.0052404822781682014, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.1537655732553764e-08, "logits/chosen": -2.9841766357421875, "logits/rejected": -2.870380163192749, "logps/chosen": -136.72784423828125, "logps/ref_chosen": -57.57489013671875, "logps/ref_rejected": -65.68423461914062, "logps/rejected": -233.48764038085938, "loss": 1.0784, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4144941568374634, "rewards/margins": 0.4586411118507385, "rewards/rejected": -0.8731353282928467, "step": 566 }, { "epoch": 0.8571428571428571, "epsilon_dpo/beta": 0.0051830909214913845, "epsilon_dpo/beta_margin_grad_mean": -0.38887423276901245, "epsilon_dpo/beta_margin_grad_std": 0.10783424973487854, "epsilon_dpo/beta_margin_mean": 0.47166863083839417, "epsilon_dpo/beta_margin_std": 0.46536019444465637, "epsilon_dpo/loss_margin_mean": 91.4534912109375, "grad_norm": 26.888280868530273, "kl/avg_steps": 0.703125, "kl/beta": 0.005219278857111931, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.089785553471233e-08, "logits/chosen": -2.8272597789764404, "logits/rejected": -2.7455508708953857, "logps/chosen": -123.11620330810547, "logps/ref_chosen": -46.228599548339844, "logps/ref_rejected": -63.579586029052734, "logps/rejected": -231.92068481445312, "loss": 1.0206, "rewards/accuracies": 0.859375, "rewards/chosen": -0.39988017082214355, "rewards/margins": 0.47166863083839417, "rewards/rejected": -0.8715487718582153, "step": 567 }, { "epoch": 0.8586545729402872, "epsilon_dpo/beta": 0.005152578931301832, "epsilon_dpo/beta_margin_grad_mean": -0.4006498456001282, "epsilon_dpo/beta_margin_grad_std": 0.11870747059583664, "epsilon_dpo/beta_margin_mean": 0.4401390254497528, "epsilon_dpo/beta_margin_std": 0.5688689351081848, "epsilon_dpo/loss_margin_mean": 85.92711639404297, "grad_norm": 30.92510414123535, "kl/avg_steps": 0.59375, "kl/beta": 0.005182837136089802, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.026418409484513e-08, "logits/chosen": -2.9527692794799805, "logits/rejected": -2.954845905303955, "logps/chosen": -125.73403930664062, "logps/ref_chosen": -50.062171936035156, "logps/ref_rejected": -78.94384765625, "logps/rejected": -240.54281616210938, "loss": 1.0648, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3904677927494049, "rewards/margins": 0.4401390552520752, "rewards/rejected": -0.8306068181991577, "step": 568 }, { "epoch": 0.8601662887377173, "epsilon_dpo/beta": 0.0051302178762853146, "epsilon_dpo/beta_margin_grad_mean": -0.41234061121940613, "epsilon_dpo/beta_margin_grad_std": 0.11904479563236237, "epsilon_dpo/beta_margin_mean": 0.3765088617801666, "epsilon_dpo/beta_margin_std": 0.514091432094574, "epsilon_dpo/loss_margin_mean": 74.07112121582031, "grad_norm": 32.503944396972656, "kl/avg_steps": 0.4375, "kl/beta": 0.005152245983481407, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.963665913810451e-08, "logits/chosen": -2.893313407897949, "logits/rejected": -2.8092219829559326, "logps/chosen": -144.25140380859375, "logps/ref_chosen": -57.58082962036133, "logps/ref_rejected": -67.66905212402344, "logps/rejected": -228.41073608398438, "loss": 1.1074, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44680964946746826, "rewards/margins": 0.3765088617801666, "rewards/rejected": -0.8233184814453125, "step": 569 }, { "epoch": 0.8616780045351474, "epsilon_dpo/beta": 0.005101457703858614, "epsilon_dpo/beta_margin_grad_mean": -0.3925977647304535, "epsilon_dpo/beta_margin_grad_std": 0.12843039631843567, "epsilon_dpo/beta_margin_mean": 0.46372681856155396, "epsilon_dpo/beta_margin_std": 0.573695719242096, "epsilon_dpo/loss_margin_mean": 91.54476165771484, "grad_norm": 33.932918548583984, "kl/avg_steps": 0.5625, "kl/beta": 0.005129802972078323, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.9015298217712453e-08, "logits/chosen": -2.8977036476135254, "logits/rejected": -2.7423322200775146, "logps/chosen": -119.95408630371094, "logps/ref_chosen": -47.76339340209961, "logps/ref_rejected": -72.54924011230469, "logps/rejected": -236.28469848632812, "loss": 1.0518, "rewards/accuracies": 0.828125, "rewards/chosen": -0.37008702754974365, "rewards/margins": 0.46372681856155396, "rewards/rejected": -0.8338139057159424, "step": 570 }, { "epoch": 0.8631897203325775, "epsilon_dpo/beta": 0.005077705252915621, "epsilon_dpo/beta_margin_grad_mean": -0.4264798164367676, "epsilon_dpo/beta_margin_grad_std": 0.12628421187400818, "epsilon_dpo/beta_margin_mean": 0.3123824894428253, "epsilon_dpo/beta_margin_std": 0.5387343764305115, "epsilon_dpo/loss_margin_mean": 62.217010498046875, "grad_norm": 27.88601303100586, "kl/avg_steps": 0.46875, "kl/beta": 0.0051011089235544205, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.840011871446962e-08, "logits/chosen": -2.8612377643585205, "logits/rejected": -2.7869882583618164, "logps/chosen": -134.21749877929688, "logps/ref_chosen": -49.611907958984375, "logps/ref_rejected": -57.0242805480957, "logps/rejected": -203.84689331054688, "loss": 1.1678, "rewards/accuracies": 0.75, "rewards/chosen": -0.43143609166145325, "rewards/margins": 0.3123824894428253, "rewards/rejected": -0.7438186407089233, "step": 571 }, { "epoch": 0.8647014361300076, "epsilon_dpo/beta": 0.00505797378718853, "epsilon_dpo/beta_margin_grad_mean": -0.41616883873939514, "epsilon_dpo/beta_margin_grad_std": 0.1045081838965416, "epsilon_dpo/beta_margin_mean": 0.3525901436805725, "epsilon_dpo/beta_margin_std": 0.4411594867706299, "epsilon_dpo/loss_margin_mean": 70.31642150878906, "grad_norm": 28.107685089111328, "kl/avg_steps": 0.390625, "kl/beta": 0.005077309440821409, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -3.1014351844787598, "logits/rejected": -2.8060758113861084, "logps/chosen": -155.38946533203125, "logps/ref_chosen": -68.19845581054688, "logps/ref_rejected": -59.052757263183594, "logps/rejected": -216.56019592285156, "loss": 1.1113, "rewards/accuracies": 0.75, "rewards/chosen": -0.4428062438964844, "rewards/margins": 0.3525901436805725, "rewards/rejected": -0.7953963875770569, "step": 572 }, { "epoch": 0.8662131519274376, "epsilon_dpo/beta": 0.005031188018620014, "epsilon_dpo/beta_margin_grad_mean": -0.3980614244937897, "epsilon_dpo/beta_margin_grad_std": 0.1336616724729538, "epsilon_dpo/beta_margin_mean": 0.4519176781177521, "epsilon_dpo/beta_margin_std": 0.611924409866333, "epsilon_dpo/loss_margin_mean": 90.50875854492188, "grad_norm": 27.5058536529541, "kl/avg_steps": 0.53125, "kl/beta": 0.005057553295046091, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.718837261761528e-08, "logits/chosen": -2.995171070098877, "logits/rejected": -2.9232358932495117, "logps/chosen": -146.7447509765625, "logps/ref_chosen": -58.643585205078125, "logps/ref_rejected": -82.25306701660156, "logps/rejected": -260.86297607421875, "loss": 1.0698, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44364631175994873, "rewards/margins": 0.4519176483154297, "rewards/rejected": -0.8955639600753784, "step": 573 }, { "epoch": 0.8677248677248677, "epsilon_dpo/beta": 0.005002235062420368, "epsilon_dpo/beta_margin_grad_mean": -0.39403048157691956, "epsilon_dpo/beta_margin_grad_std": 0.11160728335380554, "epsilon_dpo/beta_margin_mean": 0.4603891670703888, "epsilon_dpo/beta_margin_std": 0.5035741329193115, "epsilon_dpo/loss_margin_mean": 92.53056335449219, "grad_norm": 25.848438262939453, "kl/avg_steps": 0.578125, "kl/beta": 0.005030827131122351, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.659183991914696e-08, "logits/chosen": -2.940920352935791, "logits/rejected": -2.7493834495544434, "logps/chosen": -134.89476013183594, "logps/ref_chosen": -52.27944564819336, "logps/ref_rejected": -71.58317565917969, "logps/rejected": -246.72903442382812, "loss": 1.0361, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4145534634590149, "rewards/margins": 0.4603891372680664, "rewards/rejected": -0.8749426007270813, "step": 574 }, { "epoch": 0.8692365835222978, "epsilon_dpo/beta": 0.00497896084561944, "epsilon_dpo/beta_margin_grad_mean": -0.4240894317626953, "epsilon_dpo/beta_margin_grad_std": 0.14027918875217438, "epsilon_dpo/beta_margin_mean": 0.3399903476238251, "epsilon_dpo/beta_margin_std": 0.6345133185386658, "epsilon_dpo/loss_margin_mean": 69.05403900146484, "grad_norm": 28.529333114624023, "kl/avg_steps": 0.46875, "kl/beta": 0.00500190956518054, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.600155642716606e-08, "logits/chosen": -2.9890761375427246, "logits/rejected": -3.1348118782043457, "logps/chosen": -144.7169189453125, "logps/ref_chosen": -60.32773208618164, "logps/ref_rejected": -93.00959014892578, "logps/rejected": -246.45281982421875, "loss": 1.1677, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4222748875617981, "rewards/margins": 0.3399903476238251, "rewards/rejected": -0.7622652053833008, "step": 575 }, { "epoch": 0.8707482993197279, "epsilon_dpo/beta": 0.004951063077896833, "epsilon_dpo/beta_margin_grad_mean": -0.3928014039993286, "epsilon_dpo/beta_margin_grad_std": 0.12241636961698532, "epsilon_dpo/beta_margin_mean": 0.4605081081390381, "epsilon_dpo/beta_margin_std": 0.5317562818527222, "epsilon_dpo/loss_margin_mean": 93.65644073486328, "grad_norm": 24.784679412841797, "kl/avg_steps": 0.5625, "kl/beta": 0.004978572484105825, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5417538653170754e-08, "logits/chosen": -2.817110061645508, "logits/rejected": -2.827260732650757, "logps/chosen": -111.0085220336914, "logps/ref_chosen": -44.90370178222656, "logps/ref_rejected": -73.90789794921875, "logps/rejected": -233.66915893554688, "loss": 1.0446, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32796165347099304, "rewards/margins": 0.4605081081390381, "rewards/rejected": -0.7884697914123535, "step": 576 }, { "epoch": 0.872260015117158, "epsilon_dpo/beta": 0.004928010981529951, "epsilon_dpo/beta_margin_grad_mean": -0.4263003170490265, "epsilon_dpo/beta_margin_grad_std": 0.11355873942375183, "epsilon_dpo/beta_margin_mean": 0.3075699508190155, "epsilon_dpo/beta_margin_std": 0.4823225438594818, "epsilon_dpo/loss_margin_mean": 63.062171936035156, "grad_norm": 26.644804000854492, "kl/avg_steps": 0.46875, "kl/beta": 0.004950724542140961, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.4839802933393607e-08, "logits/chosen": -2.8866803646087646, "logits/rejected": -2.8407955169677734, "logps/chosen": -131.4127960205078, "logps/ref_chosen": -52.16259765625, "logps/ref_rejected": -63.407508850097656, "logps/rejected": -205.71987915039062, "loss": 1.1583, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3915864825248718, "rewards/margins": 0.3075699508190155, "rewards/rejected": -0.6991564631462097, "step": 577 }, { "epoch": 0.873771730914588, "epsilon_dpo/beta": 0.004900398664176464, "epsilon_dpo/beta_margin_grad_mean": -0.42768949270248413, "epsilon_dpo/beta_margin_grad_std": 0.10880084335803986, "epsilon_dpo/beta_margin_mean": 0.310811311006546, "epsilon_dpo/beta_margin_std": 0.4840993583202362, "epsilon_dpo/loss_margin_mean": 63.94895553588867, "grad_norm": 30.19374656677246, "kl/avg_steps": 0.5625, "kl/beta": 0.004927626345306635, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.4268365428344733e-08, "logits/chosen": -2.795511245727539, "logits/rejected": -2.834501266479492, "logps/chosen": -128.01966857910156, "logps/ref_chosen": -48.55311965942383, "logps/ref_rejected": -65.43995666503906, "logps/rejected": -208.85545349121094, "loss": 1.1539, "rewards/accuracies": 0.75, "rewards/chosen": -0.3906499147415161, "rewards/margins": 0.310811311006546, "rewards/rejected": -0.7014611959457397, "step": 578 }, { "epoch": 0.8752834467120182, "epsilon_dpo/beta": 0.004869925323873758, "epsilon_dpo/beta_margin_grad_mean": -0.4004862904548645, "epsilon_dpo/beta_margin_grad_std": 0.11176857352256775, "epsilon_dpo/beta_margin_mean": 0.4231148660182953, "epsilon_dpo/beta_margin_std": 0.48036590218544006, "epsilon_dpo/loss_margin_mean": 87.42357635498047, "grad_norm": 24.844749450683594, "kl/avg_steps": 0.625, "kl/beta": 0.00490006385371089, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.3703242122359357e-08, "logits/chosen": -2.7219746112823486, "logits/rejected": -2.5129263401031494, "logps/chosen": -133.3014373779297, "logps/ref_chosen": -48.57540512084961, "logps/ref_rejected": -55.7696418762207, "logps/rejected": -227.91925048828125, "loss": 1.0621, "rewards/accuracies": 0.78125, "rewards/chosen": -0.41359296441078186, "rewards/margins": 0.4231148362159729, "rewards/rejected": -0.8367078304290771, "step": 579 }, { "epoch": 0.8767951625094482, "epsilon_dpo/beta": 0.004847286734730005, "epsilon_dpo/beta_margin_grad_mean": -0.4198722541332245, "epsilon_dpo/beta_margin_grad_std": 0.12368777394294739, "epsilon_dpo/beta_margin_mean": 0.3465627133846283, "epsilon_dpo/beta_margin_std": 0.5466067790985107, "epsilon_dpo/loss_margin_mean": 72.16964721679688, "grad_norm": 27.92873191833496, "kl/avg_steps": 0.46875, "kl/beta": 0.004869628231972456, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.3144448823151392e-08, "logits/chosen": -2.9493236541748047, "logits/rejected": -2.945380210876465, "logps/chosen": -130.52403259277344, "logps/ref_chosen": -52.675498962402344, "logps/ref_rejected": -70.52151489257812, "logps/rejected": -220.53968811035156, "loss": 1.1394, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3781663775444031, "rewards/margins": 0.3465627133846283, "rewards/rejected": -0.724729061126709, "step": 580 }, { "epoch": 0.8783068783068783, "epsilon_dpo/beta": 0.004815582185983658, "epsilon_dpo/beta_margin_grad_mean": -0.3895708918571472, "epsilon_dpo/beta_margin_grad_std": 0.11792002618312836, "epsilon_dpo/beta_margin_mean": 0.4733353555202484, "epsilon_dpo/beta_margin_std": 0.5180349349975586, "epsilon_dpo/loss_margin_mean": 98.87920379638672, "grad_norm": 26.747188568115234, "kl/avg_steps": 0.65625, "kl/beta": 0.004846908617764711, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.259200116137039e-08, "logits/chosen": -2.927241325378418, "logits/rejected": -2.91082763671875, "logps/chosen": -128.44998168945312, "logps/ref_chosen": -51.772216796875, "logps/ref_rejected": -80.52066040039062, "logps/rejected": -256.0776062011719, "loss": 1.0308, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3703047037124634, "rewards/margins": 0.4733353555202484, "rewards/rejected": -0.8436400890350342, "step": 581 }, { "epoch": 0.8798185941043084, "epsilon_dpo/beta": 0.004796225111931562, "epsilon_dpo/beta_margin_grad_mean": -0.4094151556491852, "epsilon_dpo/beta_margin_grad_std": 0.12034616619348526, "epsilon_dpo/beta_margin_mean": 0.3930991590023041, "epsilon_dpo/beta_margin_std": 0.5316392183303833, "epsilon_dpo/loss_margin_mean": 82.65021514892578, "grad_norm": 26.879863739013672, "kl/avg_steps": 0.40625, "kl/beta": 0.004815307911485434, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.204591459016525e-08, "logits/chosen": -2.913239002227783, "logits/rejected": -2.562662363052368, "logps/chosen": -119.0417709350586, "logps/ref_chosen": -50.28034210205078, "logps/ref_rejected": -53.197418212890625, "logps/rejected": -204.6090545654297, "loss": 1.0973, "rewards/accuracies": 0.78125, "rewards/chosen": -0.33147940039634705, "rewards/margins": 0.39309918880462646, "rewards/rejected": -0.7245785593986511, "step": 582 }, { "epoch": 0.8813303099017384, "epsilon_dpo/beta": 0.004776819609105587, "epsilon_dpo/beta_margin_grad_mean": -0.3991386890411377, "epsilon_dpo/beta_margin_grad_std": 0.1288546919822693, "epsilon_dpo/beta_margin_mean": 0.4450855255126953, "epsilon_dpo/beta_margin_std": 0.5808146595954895, "epsilon_dpo/loss_margin_mean": 93.97341918945312, "grad_norm": 33.40367126464844, "kl/avg_steps": 0.40625, "kl/beta": 0.004795825108885765, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.1506204384751064e-08, "logits/chosen": -2.7750816345214844, "logits/rejected": -3.044987678527832, "logps/chosen": -113.00186157226562, "logps/ref_chosen": -43.323341369628906, "logps/ref_rejected": -83.89755249023438, "logps/rejected": -247.54949951171875, "loss": 1.0673, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3340206742286682, "rewards/margins": 0.4450855255126953, "rewards/rejected": -0.7791062593460083, "step": 583 }, { "epoch": 0.8828420256991686, "epsilon_dpo/beta": 0.004748535808175802, "epsilon_dpo/beta_margin_grad_mean": -0.40925657749176025, "epsilon_dpo/beta_margin_grad_std": 0.11300100386142731, "epsilon_dpo/beta_margin_mean": 0.3902464509010315, "epsilon_dpo/beta_margin_std": 0.496918648481369, "epsilon_dpo/loss_margin_mean": 82.72006225585938, "grad_norm": 30.039215087890625, "kl/avg_steps": 0.59375, "kl/beta": 0.004776421003043652, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.09728856419826e-08, "logits/chosen": -2.7098846435546875, "logits/rejected": -2.7800514698028564, "logps/chosen": -107.04545593261719, "logps/ref_chosen": -42.23283386230469, "logps/ref_rejected": -69.93894958496094, "logps/rejected": -217.47161865234375, "loss": 1.0914, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3089197278022766, "rewards/margins": 0.3902464509010315, "rewards/rejected": -0.6991661787033081, "step": 584 }, { "epoch": 0.8843537414965986, "epsilon_dpo/beta": 0.0047264439053833485, "epsilon_dpo/beta_margin_grad_mean": -0.41896024346351624, "epsilon_dpo/beta_margin_grad_std": 0.11874654144048691, "epsilon_dpo/beta_margin_mean": 0.3512818217277527, "epsilon_dpo/beta_margin_std": 0.5333604216575623, "epsilon_dpo/loss_margin_mean": 74.9803695678711, "grad_norm": 25.852636337280273, "kl/avg_steps": 0.46875, "kl/beta": 0.004748228471726179, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.044597327993153e-08, "logits/chosen": -2.9065184593200684, "logits/rejected": -2.8376402854919434, "logps/chosen": -134.8012237548828, "logps/ref_chosen": -51.71220779418945, "logps/ref_rejected": -75.57369232177734, "logps/rejected": -233.64306640625, "loss": 1.1312, "rewards/accuracies": 0.796875, "rewards/chosen": -0.39305341243743896, "rewards/margins": 0.3512818217277527, "rewards/rejected": -0.7443352341651917, "step": 585 }, { "epoch": 0.8858654572940288, "epsilon_dpo/beta": 0.004697006195783615, "epsilon_dpo/beta_margin_grad_mean": -0.4013258218765259, "epsilon_dpo/beta_margin_grad_std": 0.09699484705924988, "epsilon_dpo/beta_margin_mean": 0.4172016978263855, "epsilon_dpo/beta_margin_std": 0.4171310365200043, "epsilon_dpo/loss_margin_mean": 89.20626068115234, "grad_norm": 28.50130844116211, "kl/avg_steps": 0.625, "kl/beta": 0.004726074635982513, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.9925482037469187e-08, "logits/chosen": -2.8047897815704346, "logits/rejected": -2.527097225189209, "logps/chosen": -134.52972412109375, "logps/ref_chosen": -50.79621124267578, "logps/ref_rejected": -55.9830207824707, "logps/rejected": -228.92279052734375, "loss": 1.0533, "rewards/accuracies": 0.84375, "rewards/chosen": -0.39414092898368835, "rewards/margins": 0.4172016978263855, "rewards/rejected": -0.8113425970077515, "step": 586 }, { "epoch": 0.8873771730914588, "epsilon_dpo/beta": 0.0046737040393054485, "epsilon_dpo/beta_margin_grad_mean": -0.41657954454421997, "epsilon_dpo/beta_margin_grad_std": 0.13375388085842133, "epsilon_dpo/beta_margin_mean": 0.35297098755836487, "epsilon_dpo/beta_margin_std": 0.5909594297409058, "epsilon_dpo/loss_margin_mean": 76.33965301513672, "grad_norm": 26.915908813476562, "kl/avg_steps": 0.5, "kl/beta": 0.004696720279753208, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.9411426473854687e-08, "logits/chosen": -2.9439432621002197, "logits/rejected": -2.631052017211914, "logps/chosen": -140.51397705078125, "logps/ref_chosen": -57.724342346191406, "logps/ref_rejected": -57.326927185058594, "logps/rejected": -216.45620727539062, "loss": 1.1464, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3891202211380005, "rewards/margins": 0.35297101736068726, "rewards/rejected": -0.742091178894043, "step": 587 }, { "epoch": 0.8888888888888888, "epsilon_dpo/beta": 0.004646069835871458, "epsilon_dpo/beta_margin_grad_mean": -0.402698814868927, "epsilon_dpo/beta_margin_grad_std": 0.11800126731395721, "epsilon_dpo/beta_margin_mean": 0.4177962839603424, "epsilon_dpo/beta_margin_std": 0.5144898295402527, "epsilon_dpo/loss_margin_mean": 90.54971313476562, "grad_norm": 31.378711700439453, "kl/avg_steps": 0.59375, "kl/beta": 0.004673353396356106, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.890382096832699e-08, "logits/chosen": -2.8207640647888184, "logits/rejected": -2.857759952545166, "logps/chosen": -124.81417846679688, "logps/ref_chosen": -52.82999038696289, "logps/ref_rejected": -74.073486328125, "logps/rejected": -236.60739135742188, "loss": 1.0738, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3361615538597107, "rewards/margins": 0.4177963137626648, "rewards/rejected": -0.7539578676223755, "step": 588 }, { "epoch": 0.890400604686319, "epsilon_dpo/beta": 0.00461719511076808, "epsilon_dpo/beta_margin_grad_mean": -0.3952021896839142, "epsilon_dpo/beta_margin_grad_std": 0.10597512125968933, "epsilon_dpo/beta_margin_mean": 0.44892969727516174, "epsilon_dpo/beta_margin_std": 0.46933525800704956, "epsilon_dpo/loss_margin_mean": 97.71900939941406, "grad_norm": 22.320722579956055, "kl/avg_steps": 0.625, "kl/beta": 0.0046457694843411446, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.840267971970344e-08, "logits/chosen": -2.9170284271240234, "logits/rejected": -2.8472824096679688, "logps/chosen": -124.15682983398438, "logps/ref_chosen": -48.84576416015625, "logps/ref_rejected": -71.93696594238281, "logps/rejected": -244.967041015625, "loss": 1.0381, "rewards/accuracies": 0.828125, "rewards/chosen": -0.34913724660873413, "rewards/margins": 0.44892966747283936, "rewards/rejected": -0.7980669140815735, "step": 589 }, { "epoch": 0.891912320483749, "epsilon_dpo/beta": 0.004592845216393471, "epsilon_dpo/beta_margin_grad_mean": -0.40456217527389526, "epsilon_dpo/beta_margin_grad_std": 0.10449983179569244, "epsilon_dpo/beta_margin_mean": 0.4037516415119171, "epsilon_dpo/beta_margin_std": 0.45043227076530457, "epsilon_dpo/loss_margin_mean": 88.48765563964844, "grad_norm": 28.321367263793945, "kl/avg_steps": 0.53125, "kl/beta": 0.004616913385689259, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -2.877673625946045, "logits/rejected": -2.9151241779327393, "logps/chosen": -147.12371826171875, "logps/ref_chosen": -57.26580810546875, "logps/ref_rejected": -78.49354553222656, "logps/rejected": -256.839111328125, "loss": 1.0709, "rewards/accuracies": 0.828125, "rewards/chosen": -0.41413813829421997, "rewards/margins": 0.4037516713142395, "rewards/rejected": -0.8178898096084595, "step": 590 }, { "epoch": 0.8934240362811792, "epsilon_dpo/beta": 0.004572156351059675, "epsilon_dpo/beta_margin_grad_mean": -0.39427638053894043, "epsilon_dpo/beta_margin_grad_std": 0.1213071197271347, "epsilon_dpo/beta_margin_mean": 0.4570046067237854, "epsilon_dpo/beta_margin_std": 0.5295947790145874, "epsilon_dpo/loss_margin_mean": 100.73342895507812, "grad_norm": 29.821186065673828, "kl/avg_steps": 0.453125, "kl/beta": 0.004592515993863344, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.7419845883949098e-08, "logits/chosen": -2.8785204887390137, "logits/rejected": -2.9485535621643066, "logps/chosen": -113.63851165771484, "logps/ref_chosen": -52.94178009033203, "logps/ref_rejected": -80.93779754638672, "logps/rejected": -242.3679656982422, "loss": 1.0464, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27880746126174927, "rewards/margins": 0.4570046067237854, "rewards/rejected": -0.7358120679855347, "step": 591 }, { "epoch": 0.8949357520786092, "epsilon_dpo/beta": 0.0045479666441679, "epsilon_dpo/beta_margin_grad_mean": -0.41210874915122986, "epsilon_dpo/beta_margin_grad_std": 0.11019527912139893, "epsilon_dpo/beta_margin_mean": 0.37838515639305115, "epsilon_dpo/beta_margin_std": 0.4852598011493683, "epsilon_dpo/loss_margin_mean": 83.76229858398438, "grad_norm": 29.20871353149414, "kl/avg_steps": 0.53125, "kl/beta": 0.0045717996545135975, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6938180788793556e-08, "logits/chosen": -2.7410738468170166, "logits/rejected": -2.8994481563568115, "logps/chosen": -105.55567169189453, "logps/ref_chosen": -43.858707427978516, "logps/ref_rejected": -74.10005187988281, "logps/rejected": -219.559326171875, "loss": 1.0983, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2813223898410797, "rewards/margins": 0.37838515639305115, "rewards/rejected": -0.6597075462341309, "step": 592 }, { "epoch": 0.8964474678760394, "epsilon_dpo/beta": 0.004523933865129948, "epsilon_dpo/beta_margin_grad_mean": -0.40942201018333435, "epsilon_dpo/beta_margin_grad_std": 0.11248691380023956, "epsilon_dpo/beta_margin_mean": 0.38533318042755127, "epsilon_dpo/beta_margin_std": 0.4791716933250427, "epsilon_dpo/loss_margin_mean": 85.82151794433594, "grad_norm": 23.08043098449707, "kl/avg_steps": 0.53125, "kl/beta": 0.004547640681266785, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6463034933723336e-08, "logits/chosen": -2.6225242614746094, "logits/rejected": -2.703193187713623, "logps/chosen": -89.98973846435547, "logps/ref_chosen": -37.054229736328125, "logps/ref_rejected": -59.88328552246094, "logps/rejected": -198.64031982421875, "loss": 1.0925, "rewards/accuracies": 0.75, "rewards/chosen": -0.24037033319473267, "rewards/margins": 0.38533318042755127, "rewards/rejected": -0.6257035732269287, "step": 593 }, { "epoch": 0.8979591836734694, "epsilon_dpo/beta": 0.004507096018642187, "epsilon_dpo/beta_margin_grad_mean": -0.42514657974243164, "epsilon_dpo/beta_margin_grad_std": 0.10777713358402252, "epsilon_dpo/beta_margin_mean": 0.3148195147514343, "epsilon_dpo/beta_margin_std": 0.4587323069572449, "epsilon_dpo/loss_margin_mean": 70.49857330322266, "grad_norm": 22.410165786743164, "kl/avg_steps": 0.375, "kl/beta": 0.004523608833551407, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5994421609589385e-08, "logits/chosen": -2.884352684020996, "logits/rejected": -2.7569727897644043, "logps/chosen": -138.41769409179688, "logps/ref_chosen": -56.08311462402344, "logps/ref_rejected": -65.32080078125, "logps/rejected": -218.15396118164062, "loss": 1.1466, "rewards/accuracies": 0.75, "rewards/chosen": -0.3732605576515198, "rewards/margins": 0.3148195147514343, "rewards/rejected": -0.6880800724029541, "step": 594 }, { "epoch": 0.8994708994708994, "epsilon_dpo/beta": 0.004476172383874655, "epsilon_dpo/beta_margin_grad_mean": -0.3953574597835541, "epsilon_dpo/beta_margin_grad_std": 0.10987018048763275, "epsilon_dpo/beta_margin_mean": 0.4444710314273834, "epsilon_dpo/beta_margin_std": 0.4707097113132477, "epsilon_dpo/loss_margin_mean": 99.83670806884766, "grad_norm": 31.24823570251465, "kl/avg_steps": 0.6875, "kl/beta": 0.004506708588451147, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.553235392451377e-08, "logits/chosen": -2.774689197540283, "logits/rejected": -2.816162586212158, "logps/chosen": -123.99523162841797, "logps/ref_chosen": -47.433860778808594, "logps/ref_rejected": -77.87222290039062, "logps/rejected": -254.27029418945312, "loss": 1.0432, "rewards/accuracies": 0.859375, "rewards/chosen": -0.34356194734573364, "rewards/margins": 0.44447100162506104, "rewards/rejected": -0.7880330085754395, "step": 595 }, { "epoch": 0.9009826152683296, "epsilon_dpo/beta": 0.004462395794689655, "epsilon_dpo/beta_margin_grad_mean": -0.44681206345558167, "epsilon_dpo/beta_margin_grad_std": 0.10053098946809769, "epsilon_dpo/beta_margin_mean": 0.2238132357597351, "epsilon_dpo/beta_margin_std": 0.4253426194190979, "epsilon_dpo/loss_margin_mean": 50.81544876098633, "grad_norm": 26.036802291870117, "kl/avg_steps": 0.3125, "kl/beta": 0.004475936759263277, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.507684480352292e-08, "logits/chosen": -2.942657709121704, "logits/rejected": -2.7600269317626953, "logps/chosen": -141.15467834472656, "logps/ref_chosen": -58.89629364013672, "logps/ref_rejected": -54.34562683105469, "logps/rejected": -187.41946411132812, "loss": 1.2186, "rewards/accuracies": 0.671875, "rewards/chosen": -0.36806827783584595, "rewards/margins": 0.2238132506608963, "rewards/rejected": -0.591881513595581, "step": 596 }, { "epoch": 0.9024943310657596, "epsilon_dpo/beta": 0.004438732285052538, "epsilon_dpo/beta_margin_grad_mean": -0.42079585790634155, "epsilon_dpo/beta_margin_grad_std": 0.10180939733982086, "epsilon_dpo/beta_margin_mean": 0.33635836839675903, "epsilon_dpo/beta_margin_std": 0.4384286105632782, "epsilon_dpo/loss_margin_mean": 76.30511474609375, "grad_norm": 21.564193725585938, "kl/avg_steps": 0.53125, "kl/beta": 0.004461992997676134, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.4627906988186111e-08, "logits/chosen": -2.7075767517089844, "logits/rejected": -2.569540500640869, "logps/chosen": -102.53973388671875, "logps/ref_chosen": -44.906951904296875, "logps/ref_rejected": -51.548377990722656, "logps/rejected": -185.4862823486328, "loss": 1.1236, "rewards/accuracies": 0.75, "rewards/chosen": -0.25682199001312256, "rewards/margins": 0.33635836839675903, "rewards/rejected": -0.5931803584098816, "step": 597 }, { "epoch": 0.9040060468631897, "epsilon_dpo/beta": 0.004419437609612942, "epsilon_dpo/beta_margin_grad_mean": -0.4219864308834076, "epsilon_dpo/beta_margin_grad_std": 0.10997747629880905, "epsilon_dpo/beta_margin_mean": 0.333780974149704, "epsilon_dpo/beta_margin_std": 0.4738696813583374, "epsilon_dpo/loss_margin_mean": 76.19058227539062, "grad_norm": 28.94860076904297, "kl/avg_steps": 0.4375, "kl/beta": 0.004438413772732019, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.4185553036259095e-08, "logits/chosen": -2.84783935546875, "logits/rejected": -2.8290257453918457, "logps/chosen": -143.2371826171875, "logps/ref_chosen": -52.169158935546875, "logps/ref_rejected": -72.91605377197266, "logps/rejected": -240.17465209960938, "loss": 1.1334, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4042730927467346, "rewards/margins": 0.33378100395202637, "rewards/rejected": -0.7380540370941162, "step": 598 }, { "epoch": 0.9055177626606198, "epsilon_dpo/beta": 0.00440018717199564, "epsilon_dpo/beta_margin_grad_mean": -0.4423910975456238, "epsilon_dpo/beta_margin_grad_std": 0.10905639082193375, "epsilon_dpo/beta_margin_mean": 0.2445920705795288, "epsilon_dpo/beta_margin_std": 0.4666239619255066, "epsilon_dpo/loss_margin_mean": 56.237117767333984, "grad_norm": 30.14556312561035, "kl/avg_steps": 0.4375, "kl/beta": 0.004419080447405577, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.3749795321332885e-08, "logits/chosen": -2.808702230453491, "logits/rejected": -2.7852392196655273, "logps/chosen": -136.89111328125, "logps/ref_chosen": -48.68849182128906, "logps/ref_rejected": -66.994384765625, "logps/rejected": -211.4341278076172, "loss": 1.2088, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3896467685699463, "rewards/margins": 0.24459205567836761, "rewards/rejected": -0.6342388391494751, "step": 599 }, { "epoch": 0.9070294784580499, "epsilon_dpo/beta": 0.004372769501060247, "epsilon_dpo/beta_margin_grad_mean": -0.4085850417613983, "epsilon_dpo/beta_margin_grad_std": 0.10669802129268646, "epsilon_dpo/beta_margin_mean": 0.39262694120407104, "epsilon_dpo/beta_margin_std": 0.47047311067581177, "epsilon_dpo/loss_margin_mean": 90.25757598876953, "grad_norm": 28.697786331176758, "kl/avg_steps": 0.625, "kl/beta": 0.004399830941110849, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.3320646032487393e-08, "logits/chosen": -2.897848606109619, "logits/rejected": -2.8930137157440186, "logps/chosen": -123.99551391601562, "logps/ref_chosen": -52.185142517089844, "logps/ref_rejected": -77.21652221679688, "logps/rejected": -239.2844696044922, "loss": 1.0833, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3151997923851013, "rewards/margins": 0.39262694120407104, "rewards/rejected": -0.7078267335891724, "step": 600 }, { "epoch": 0.9070294784580499, "eval_epsilon_dpo/beta": 0.004354366101324558, "eval_epsilon_dpo/beta_margin_grad_mean": -0.433778315782547, "eval_epsilon_dpo/beta_margin_grad_std": 0.11544771492481232, "eval_epsilon_dpo/beta_margin_mean": 0.2809829115867615, "eval_epsilon_dpo/beta_margin_std": 0.5001131892204285, "eval_epsilon_dpo/loss_margin_mean": 65.28053283691406, "eval_kl/n_epsilon_steps": 0.2869718372821808, "eval_kl/p_epsilon_steps": 0.7117077708244324, "eval_logits/chosen": -3.1673834323883057, "eval_logits/rejected": -2.9705352783203125, "eval_logps/chosen": -171.74356079101562, "eval_logps/ref_chosen": -77.40868377685547, "eval_logps/ref_rejected": -73.52816772460938, "eval_logps/rejected": -233.14356994628906, "eval_loss": 0.5935050249099731, "eval_rewards/accuracies": 0.7196303009986877, "eval_rewards/chosen": -0.41265228390693665, "eval_rewards/margins": 0.2809829115867615, "eval_rewards/rejected": -0.6936351656913757, "eval_runtime": 37.244, "eval_samples_per_second": 61.836, "eval_steps_per_second": 1.933, "step": 600 }, { "epoch": 0.90854119425548, "epsilon_dpo/beta": 0.004351075738668442, "epsilon_dpo/beta_margin_grad_mean": -0.40804341435432434, "epsilon_dpo/beta_margin_grad_std": 0.11943647265434265, "epsilon_dpo/beta_margin_mean": 0.40172702074050903, "epsilon_dpo/beta_margin_std": 0.5348893404006958, "epsilon_dpo/loss_margin_mean": 93.01659393310547, "grad_norm": 22.14923667907715, "kl/avg_steps": 0.5, "kl/beta": 0.004372503142803907, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2898117173950868e-08, "logits/chosen": -2.942854642868042, "logits/rejected": -3.0243873596191406, "logps/chosen": -121.71076965332031, "logps/ref_chosen": -51.47467041015625, "logps/ref_rejected": -80.93226623535156, "logps/rejected": -244.18496704101562, "loss": 1.0904, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3073297142982483, "rewards/margins": 0.40172702074050903, "rewards/rejected": -0.7090567350387573, "step": 601 }, { "epoch": 0.91005291005291, "epsilon_dpo/beta": 0.004326708614826202, "epsilon_dpo/beta_margin_grad_mean": -0.41767358779907227, "epsilon_dpo/beta_margin_grad_std": 0.1040278822183609, "epsilon_dpo/beta_margin_mean": 0.35357314348220825, "epsilon_dpo/beta_margin_std": 0.4726065993309021, "epsilon_dpo/loss_margin_mean": 82.27822875976562, "grad_norm": 24.7401123046875, "kl/avg_steps": 0.5625, "kl/beta": 0.004350749310106039, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.2482220564763667e-08, "logits/chosen": -2.941708564758301, "logits/rejected": -2.7913525104522705, "logps/chosen": -125.37621307373047, "logps/ref_chosen": -57.40228271484375, "logps/ref_rejected": -63.27595520019531, "logps/rejected": -213.52810668945312, "loss": 1.1146, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2954300045967102, "rewards/margins": 0.35357314348220825, "rewards/rejected": -0.6490031480789185, "step": 602 }, { "epoch": 0.9115646258503401, "epsilon_dpo/beta": 0.004309267736971378, "epsilon_dpo/beta_margin_grad_mean": -0.42406466603279114, "epsilon_dpo/beta_margin_grad_std": 0.11282075941562653, "epsilon_dpo/beta_margin_mean": 0.32830390334129333, "epsilon_dpo/beta_margin_std": 0.5021275281906128, "epsilon_dpo/loss_margin_mean": 76.90867614746094, "grad_norm": 26.939064025878906, "kl/avg_steps": 0.40625, "kl/beta": 0.004326412919908762, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.2072967838448051e-08, "logits/chosen": -2.7085061073303223, "logits/rejected": -2.710064649581909, "logps/chosen": -119.18380737304688, "logps/ref_chosen": -43.703678131103516, "logps/ref_rejected": -70.03620147705078, "logps/rejected": -222.4250030517578, "loss": 1.1432, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3262648284435272, "rewards/margins": 0.32830390334129333, "rewards/rejected": -0.6545687317848206, "step": 603 }, { "epoch": 0.9130763416477702, "epsilon_dpo/beta": 0.004290485754609108, "epsilon_dpo/beta_margin_grad_mean": -0.43978404998779297, "epsilon_dpo/beta_margin_grad_std": 0.11687634140253067, "epsilon_dpo/beta_margin_mean": 0.25768232345581055, "epsilon_dpo/beta_margin_std": 0.5071827173233032, "epsilon_dpo/loss_margin_mean": 60.8136100769043, "grad_norm": 28.23758888244629, "kl/avg_steps": 0.4375, "kl/beta": 0.004308908246457577, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1670370442682459e-08, "logits/chosen": -2.9371981620788574, "logits/rejected": -2.7974581718444824, "logps/chosen": -143.65338134765625, "logps/ref_chosen": -65.72084045410156, "logps/ref_rejected": -61.16135787963867, "logps/rejected": -199.90750122070312, "loss": 1.2061, "rewards/accuracies": 0.671875, "rewards/chosen": -0.33624735474586487, "rewards/margins": 0.25768232345581055, "rewards/rejected": -0.593929648399353, "step": 604 }, { "epoch": 0.9145880574452003, "epsilon_dpo/beta": 0.004262411035597324, "epsilon_dpo/beta_margin_grad_mean": -0.4184982180595398, "epsilon_dpo/beta_margin_grad_std": 0.10317090898752213, "epsilon_dpo/beta_margin_mean": 0.344939649105072, "epsilon_dpo/beta_margin_std": 0.4527731239795685, "epsilon_dpo/loss_margin_mean": 81.44808959960938, "grad_norm": 28.087738037109375, "kl/avg_steps": 0.65625, "kl/beta": 0.004290138836950064, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.1274439638981532e-08, "logits/chosen": -2.9225902557373047, "logits/rejected": -2.8271231651306152, "logps/chosen": -136.39425659179688, "logps/ref_chosen": -55.168067932128906, "logps/ref_rejected": -71.60811614990234, "logps/rejected": -234.28240966796875, "loss": 1.119, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34680402278900146, "rewards/margins": 0.344939649105072, "rewards/rejected": -0.6917436718940735, "step": 605 }, { "epoch": 0.9160997732426304, "epsilon_dpo/beta": 0.004243945702910423, "epsilon_dpo/beta_margin_grad_mean": -0.4184347093105316, "epsilon_dpo/beta_margin_grad_std": 0.10365695506334305, "epsilon_dpo/beta_margin_mean": 0.3442908823490143, "epsilon_dpo/beta_margin_std": 0.44092631340026855, "epsilon_dpo/loss_margin_mean": 81.81916809082031, "grad_norm": 27.393869400024414, "kl/avg_steps": 0.4375, "kl/beta": 0.004262168426066637, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0885186502381016e-08, "logits/chosen": -2.8369555473327637, "logits/rejected": -2.8978796005249023, "logps/chosen": -119.1710205078125, "logps/ref_chosen": -47.27920150756836, "logps/ref_rejected": -71.61123657226562, "logps/rejected": -225.3222198486328, "loss": 1.1179, "rewards/accuracies": 0.75, "rewards/chosen": -0.3063431680202484, "rewards/margins": 0.3442908525466919, "rewards/rejected": -0.6506340503692627, "step": 606 }, { "epoch": 0.9176114890400605, "epsilon_dpo/beta": 0.004221480805426836, "epsilon_dpo/beta_margin_grad_mean": -0.4102874994277954, "epsilon_dpo/beta_margin_grad_std": 0.10008849203586578, "epsilon_dpo/beta_margin_mean": 0.3794682025909424, "epsilon_dpo/beta_margin_std": 0.4316023886203766, "epsilon_dpo/loss_margin_mean": 90.42269897460938, "grad_norm": 21.85505485534668, "kl/avg_steps": 0.53125, "kl/beta": 0.0042436025105416775, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.0502621921127774e-08, "logits/chosen": -2.7999117374420166, "logits/rejected": -2.6736483573913574, "logps/chosen": -129.7017822265625, "logps/ref_chosen": -55.4199104309082, "logps/ref_rejected": -71.0352783203125, "logps/rejected": -235.73983764648438, "loss": 1.0866, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3146989643573761, "rewards/margins": 0.3794682025909424, "rewards/rejected": -0.6941671371459961, "step": 607 }, { "epoch": 0.9191232048374905, "epsilon_dpo/beta": 0.004200491588562727, "epsilon_dpo/beta_margin_grad_mean": -0.422316312789917, "epsilon_dpo/beta_margin_grad_std": 0.10893644392490387, "epsilon_dpo/beta_margin_mean": 0.33482372760772705, "epsilon_dpo/beta_margin_std": 0.4801502823829651, "epsilon_dpo/loss_margin_mean": 80.29708862304688, "grad_norm": 29.400541305541992, "kl/avg_steps": 0.5, "kl/beta": 0.004221177659928799, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -2.796797275543213, "logits/rejected": -2.924224376678467, "logps/chosen": -142.24488830566406, "logps/ref_chosen": -53.32804489135742, "logps/ref_rejected": -92.88279724121094, "logps/rejected": -262.0967102050781, "loss": 1.1332, "rewards/accuracies": 0.78125, "rewards/chosen": -0.37449347972869873, "rewards/margins": 0.33482372760772705, "rewards/rejected": -0.7093172073364258, "step": 608 }, { "epoch": 0.9206349206349206, "epsilon_dpo/beta": 0.004170405212789774, "epsilon_dpo/beta_margin_grad_mean": -0.3986393213272095, "epsilon_dpo/beta_margin_grad_std": 0.08508092164993286, "epsilon_dpo/beta_margin_mean": 0.42621928453445435, "epsilon_dpo/beta_margin_std": 0.37211543321609497, "epsilon_dpo/loss_margin_mean": 102.51534271240234, "grad_norm": 24.681608200073242, "kl/avg_steps": 0.71875, "kl/beta": 0.004200176801532507, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.757601041885694e-09, "logits/chosen": -2.8861827850341797, "logits/rejected": -2.725926399230957, "logps/chosen": -98.91116333007812, "logps/ref_chosen": -45.84394073486328, "logps/ref_rejected": -58.6300163269043, "logps/rejected": -214.21258544921875, "loss": 1.0374, "rewards/accuracies": 0.890625, "rewards/chosen": -0.22198635339736938, "rewards/margins": 0.42621928453445435, "rewards/rejected": -0.6482056379318237, "step": 609 }, { "epoch": 0.9221466364323507, "epsilon_dpo/beta": 0.00414650235325098, "epsilon_dpo/beta_margin_grad_mean": -0.4166888892650604, "epsilon_dpo/beta_margin_grad_std": 0.11117517948150635, "epsilon_dpo/beta_margin_mean": 0.3558121919631958, "epsilon_dpo/beta_margin_std": 0.4910913407802582, "epsilon_dpo/loss_margin_mean": 86.44804382324219, "grad_norm": 21.312742233276367, "kl/avg_steps": 0.578125, "kl/beta": 0.004170203115791082, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.395165583732379e-09, "logits/chosen": -2.973998546600342, "logits/rejected": -2.940730094909668, "logps/chosen": -145.8649444580078, "logps/ref_chosen": -64.93912506103516, "logps/ref_rejected": -80.43107604980469, "logps/rejected": -247.80494689941406, "loss": 1.1181, "rewards/accuracies": 0.78125, "rewards/chosen": -0.33682912588119507, "rewards/margins": 0.3558121919631958, "rewards/rejected": -0.6926413178443909, "step": 610 }, { "epoch": 0.9236583522297808, "epsilon_dpo/beta": 0.004129801876842976, "epsilon_dpo/beta_margin_grad_mean": -0.43115392327308655, "epsilon_dpo/beta_margin_grad_std": 0.10162033140659332, "epsilon_dpo/beta_margin_mean": 0.289081871509552, "epsilon_dpo/beta_margin_std": 0.42765942215919495, "epsilon_dpo/loss_margin_mean": 70.68115997314453, "grad_norm": 23.11383056640625, "kl/avg_steps": 0.40625, "kl/beta": 0.004146232735365629, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.03946036001449e-09, "logits/chosen": -2.901271343231201, "logits/rejected": -2.9097323417663574, "logps/chosen": -110.976318359375, "logps/ref_chosen": -41.86810302734375, "logps/ref_rejected": -65.64398193359375, "logps/rejected": -205.433349609375, "loss": 1.1621, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2863096594810486, "rewards/margins": 0.2890819013118744, "rewards/rejected": -0.5753915309906006, "step": 611 }, { "epoch": 0.9251700680272109, "epsilon_dpo/beta": 0.0041027674451470375, "epsilon_dpo/beta_margin_grad_mean": -0.4063434600830078, "epsilon_dpo/beta_margin_grad_std": 0.11194141954183578, "epsilon_dpo/beta_margin_mean": 0.40143850445747375, "epsilon_dpo/beta_margin_std": 0.49290162324905396, "epsilon_dpo/loss_margin_mean": 98.34375762939453, "grad_norm": 25.89215660095215, "kl/avg_steps": 0.65625, "kl/beta": 0.004129456821829081, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.690495320571839e-09, "logits/chosen": -3.020923614501953, "logits/rejected": -3.09545636177063, "logps/chosen": -141.45028686523438, "logps/ref_chosen": -58.225341796875, "logps/ref_rejected": -92.80967712402344, "logps/rejected": -274.37835693359375, "loss": 1.0815, "rewards/accuracies": 0.875, "rewards/chosen": -0.34255552291870117, "rewards/margins": 0.40143853425979614, "rewards/rejected": -0.7439939975738525, "step": 612 }, { "epoch": 0.926681783824641, "epsilon_dpo/beta": 0.004082429688423872, "epsilon_dpo/beta_margin_grad_mean": -0.4085478186607361, "epsilon_dpo/beta_margin_grad_std": 0.10222490131855011, "epsilon_dpo/beta_margin_mean": 0.38443639874458313, "epsilon_dpo/beta_margin_std": 0.4361855983734131, "epsilon_dpo/loss_margin_mean": 94.82818603515625, "grad_norm": 27.970016479492188, "kl/avg_steps": 0.5, "kl/beta": 0.004102534148842096, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.348280226706722e-09, "logits/chosen": -2.817446231842041, "logits/rejected": -2.598099708557129, "logps/chosen": -111.469970703125, "logps/ref_chosen": -45.957725524902344, "logps/ref_rejected": -54.168678283691406, "logps/rejected": -214.50912475585938, "loss": 1.0839, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2686220407485962, "rewards/margins": 0.38443639874458313, "rewards/rejected": -0.6530584096908569, "step": 613 }, { "epoch": 0.9281934996220711, "epsilon_dpo/beta": 0.0040531884878873825, "epsilon_dpo/beta_margin_grad_mean": -0.396096408367157, "epsilon_dpo/beta_margin_grad_std": 0.10546223819255829, "epsilon_dpo/beta_margin_mean": 0.43869373202323914, "epsilon_dpo/beta_margin_std": 0.47157707810401917, "epsilon_dpo/loss_margin_mean": 108.73670196533203, "grad_norm": 26.494779586791992, "kl/avg_steps": 0.71875, "kl/beta": 0.0040821232832968235, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.012824650910937e-09, "logits/chosen": -2.8438384532928467, "logits/rejected": -2.549985885620117, "logps/chosen": -121.44894409179688, "logps/ref_chosen": -49.787620544433594, "logps/ref_rejected": -66.30805206298828, "logps/rejected": -246.70606994628906, "loss": 1.0467, "rewards/accuracies": 0.875, "rewards/chosen": -0.2915458083152771, "rewards/margins": 0.43869373202323914, "rewards/rejected": -0.7302395105361938, "step": 614 }, { "epoch": 0.9297052154195011, "epsilon_dpo/beta": 0.004034397192299366, "epsilon_dpo/beta_margin_grad_mean": -0.417182981967926, "epsilon_dpo/beta_margin_grad_std": 0.09964893013238907, "epsilon_dpo/beta_margin_mean": 0.3520458936691284, "epsilon_dpo/beta_margin_std": 0.43388935923576355, "epsilon_dpo/loss_margin_mean": 87.89664459228516, "grad_norm": 25.445205688476562, "kl/avg_steps": 0.46875, "kl/beta": 0.004052992444485426, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.684137976598088e-09, "logits/chosen": -3.123307228088379, "logits/rejected": -3.107877492904663, "logps/chosen": -151.17300415039062, "logps/ref_chosen": -66.87469482421875, "logps/ref_rejected": -96.79026794433594, "logps/rejected": -268.9852294921875, "loss": 1.1092, "rewards/accuracies": 0.765625, "rewards/chosen": -0.34137794375419617, "rewards/margins": 0.3520458936691284, "rewards/rejected": -0.693423867225647, "step": 615 }, { "epoch": 0.9312169312169312, "epsilon_dpo/beta": 0.004021878354251385, "epsilon_dpo/beta_margin_grad_mean": -0.43128591775894165, "epsilon_dpo/beta_margin_grad_std": 0.09986625611782074, "epsilon_dpo/beta_margin_mean": 0.28901001811027527, "epsilon_dpo/beta_margin_std": 0.41993412375450134, "epsilon_dpo/loss_margin_mean": 72.62486267089844, "grad_norm": 24.475595474243164, "kl/avg_steps": 0.3125, "kl/beta": 0.004034082870930433, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 7.36222939784098e-09, "logits/chosen": -2.7118353843688965, "logits/rejected": -2.619784355163574, "logps/chosen": -140.18765258789062, "logps/ref_chosen": -52.034664154052734, "logps/ref_rejected": -66.86343383789062, "logps/rejected": -227.64129638671875, "loss": 1.1606, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3562736213207245, "rewards/margins": 0.28901001811027527, "rewards/rejected": -0.6452836394309998, "step": 616 }, { "epoch": 0.9327286470143613, "epsilon_dpo/beta": 0.003993010148406029, "epsilon_dpo/beta_margin_grad_mean": -0.42017489671707153, "epsilon_dpo/beta_margin_grad_std": 0.09223072230815887, "epsilon_dpo/beta_margin_mean": 0.33353152871131897, "epsilon_dpo/beta_margin_std": 0.39289456605911255, "epsilon_dpo/loss_margin_mean": 83.95561218261719, "grad_norm": 24.979202270507812, "kl/avg_steps": 0.71875, "kl/beta": 0.004021515604108572, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.047107919114586e-09, "logits/chosen": -2.7569775581359863, "logits/rejected": -2.811448097229004, "logps/chosen": -132.08035278320312, "logps/ref_chosen": -49.29132843017578, "logps/ref_rejected": -76.61003112792969, "logps/rejected": -243.35467529296875, "loss": 1.1173, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3312140107154846, "rewards/margins": 0.33353152871131897, "rewards/rejected": -0.6647455096244812, "step": 617 }, { "epoch": 0.9342403628117913, "epsilon_dpo/beta": 0.003968259319663048, "epsilon_dpo/beta_margin_grad_mean": -0.4206778109073639, "epsilon_dpo/beta_margin_grad_std": 0.09060176461935043, "epsilon_dpo/beta_margin_mean": 0.3324904143810272, "epsilon_dpo/beta_margin_std": 0.3870512545108795, "epsilon_dpo/loss_margin_mean": 84.2536849975586, "grad_norm": 24.261600494384766, "kl/avg_steps": 0.625, "kl/beta": 0.003992817364633083, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.738782355044048e-09, "logits/chosen": -2.917628765106201, "logits/rejected": -2.896481990814209, "logps/chosen": -129.22314453125, "logps/ref_chosen": -53.00926971435547, "logps/ref_rejected": -77.66494750976562, "logps/rejected": -238.13250732421875, "loss": 1.117, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30343344807624817, "rewards/margins": 0.33249038457870483, "rewards/rejected": -0.6359238624572754, "step": 618 }, { "epoch": 0.9357520786092215, "epsilon_dpo/beta": 0.003941131290048361, "epsilon_dpo/beta_margin_grad_mean": -0.41602036356925964, "epsilon_dpo/beta_margin_grad_std": 0.09459950029850006, "epsilon_dpo/beta_margin_mean": 0.35305193066596985, "epsilon_dpo/beta_margin_std": 0.4066655933856964, "epsilon_dpo/loss_margin_mean": 90.03031921386719, "grad_norm": 24.391008377075195, "kl/avg_steps": 0.6875, "kl/beta": 0.003968017175793648, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.437261330158206e-09, "logits/chosen": -2.759155035018921, "logits/rejected": -2.827423334121704, "logps/chosen": -113.48587036132812, "logps/ref_chosen": -44.78382110595703, "logps/ref_rejected": -70.15126037597656, "logps/rejected": -228.88360595703125, "loss": 1.1035, "rewards/accuracies": 0.859375, "rewards/chosen": -0.2713110148906708, "rewards/margins": 0.35305193066596985, "rewards/rejected": -0.6243629455566406, "step": 619 }, { "epoch": 0.9372637944066515, "epsilon_dpo/beta": 0.003924074117094278, "epsilon_dpo/beta_margin_grad_mean": -0.430696964263916, "epsilon_dpo/beta_margin_grad_std": 0.09414663165807724, "epsilon_dpo/beta_margin_mean": 0.2892342507839203, "epsilon_dpo/beta_margin_std": 0.3954410254955292, "epsilon_dpo/loss_margin_mean": 74.37684631347656, "grad_norm": 25.752180099487305, "kl/avg_steps": 0.4375, "kl/beta": 0.003940923139452934, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.142553278648238e-09, "logits/chosen": -2.8241682052612305, "logits/rejected": -2.7016429901123047, "logps/chosen": -130.63491821289062, "logps/ref_chosen": -52.11176681518555, "logps/ref_rejected": -61.76144790649414, "logps/rejected": -214.66143798828125, "loss": 1.1557, "rewards/accuracies": 0.75, "rewards/chosen": -0.3093280494213104, "rewards/margins": 0.2892342805862427, "rewards/rejected": -0.5985623598098755, "step": 620 }, { "epoch": 0.9387755102040817, "epsilon_dpo/beta": 0.003904528683051467, "epsilon_dpo/beta_margin_grad_mean": -0.4154071807861328, "epsilon_dpo/beta_margin_grad_std": 0.10143372416496277, "epsilon_dpo/beta_margin_mean": 0.3625137209892273, "epsilon_dpo/beta_margin_std": 0.4495268762111664, "epsilon_dpo/loss_margin_mean": 93.47482299804688, "grad_norm": 29.722471237182617, "kl/avg_steps": 0.5, "kl/beta": 0.003923757001757622, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.854666444131934e-09, "logits/chosen": -2.760948419570923, "logits/rejected": -2.9007914066314697, "logps/chosen": -111.168701171875, "logps/ref_chosen": -43.81205749511719, "logps/ref_rejected": -81.87296295166016, "logps/rejected": -242.70443725585938, "loss": 1.1033, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2636774480342865, "rewards/margins": 0.3625137209892273, "rewards/rejected": -0.6261911392211914, "step": 621 }, { "epoch": 0.9402872260015117, "epsilon_dpo/beta": 0.0038875434547662735, "epsilon_dpo/beta_margin_grad_mean": -0.42293763160705566, "epsilon_dpo/beta_margin_grad_std": 0.09363308548927307, "epsilon_dpo/beta_margin_mean": 0.32302403450012207, "epsilon_dpo/beta_margin_std": 0.39626947045326233, "epsilon_dpo/loss_margin_mean": 83.76898193359375, "grad_norm": 22.920312881469727, "kl/avg_steps": 0.4375, "kl/beta": 0.0039042357821017504, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.573608879422875e-09, "logits/chosen": -2.9230761528015137, "logits/rejected": -2.877959966659546, "logps/chosen": -142.1696319580078, "logps/ref_chosen": -54.599464416503906, "logps/ref_rejected": -73.850341796875, "logps/rejected": -245.18951416015625, "loss": 1.1269, "rewards/accuracies": 0.765625, "rewards/chosen": -0.34153610467910767, "rewards/margins": 0.32302403450012207, "rewards/rejected": -0.6645601391792297, "step": 622 }, { "epoch": 0.9417989417989417, "epsilon_dpo/beta": 0.0038754690904170275, "epsilon_dpo/beta_margin_grad_mean": -0.44045644998550415, "epsilon_dpo/beta_margin_grad_std": 0.11197041720151901, "epsilon_dpo/beta_margin_mean": 0.24990543723106384, "epsilon_dpo/beta_margin_std": 0.4693276584148407, "epsilon_dpo/loss_margin_mean": 65.39022064208984, "grad_norm": 23.00417709350586, "kl/avg_steps": 0.3125, "kl/beta": 0.0038872291333973408, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 5.299388446305342e-09, "logits/chosen": -2.9515085220336914, "logits/rejected": -2.813601493835449, "logps/chosen": -173.72381591796875, "logps/ref_chosen": -60.89094543457031, "logps/ref_rejected": -78.56924438476562, "logps/rejected": -256.7923583984375, "loss": 1.2054, "rewards/accuracies": 0.671875, "rewards/chosen": -0.43945789337158203, "rewards/margins": 0.24990543723106384, "rewards/rejected": -0.6893633008003235, "step": 623 }, { "epoch": 0.9433106575963719, "epsilon_dpo/beta": 0.0038500740192830563, "epsilon_dpo/beta_margin_grad_mean": -0.4076012969017029, "epsilon_dpo/beta_margin_grad_std": 0.10472734272480011, "epsilon_dpo/beta_margin_mean": 0.3958456218242645, "epsilon_dpo/beta_margin_std": 0.4630257189273834, "epsilon_dpo/loss_margin_mean": 103.33561706542969, "grad_norm": 22.28907585144043, "kl/avg_steps": 0.65625, "kl/beta": 0.0038751193787902594, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.03201281531429e-09, "logits/chosen": -2.7976436614990234, "logits/rejected": -2.9112424850463867, "logps/chosen": -108.95379638671875, "logps/ref_chosen": -44.95097351074219, "logps/ref_rejected": -76.1891098022461, "logps/rejected": -243.52755737304688, "loss": 1.0791, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24731740355491638, "rewards/margins": 0.3958456218242645, "rewards/rejected": -0.6431630849838257, "step": 624 }, { "epoch": 0.9448223733938019, "epsilon_dpo/beta": 0.003832787275314331, "epsilon_dpo/beta_margin_grad_mean": -0.4320569634437561, "epsilon_dpo/beta_margin_grad_std": 0.10303378850221634, "epsilon_dpo/beta_margin_mean": 0.2886675000190735, "epsilon_dpo/beta_margin_std": 0.44123175740242004, "epsilon_dpo/loss_margin_mean": 75.97837829589844, "grad_norm": 24.238208770751953, "kl/avg_steps": 0.453125, "kl/beta": 0.0038498546928167343, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.7714894655209174e-09, "logits/chosen": -2.802748680114746, "logits/rejected": -2.8705244064331055, "logps/chosen": -125.26852416992188, "logps/ref_chosen": -49.88308334350586, "logps/ref_rejected": -82.66120910644531, "logps/rejected": -234.0250244140625, "loss": 1.1648, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2895575761795044, "rewards/margins": 0.2886675000190735, "rewards/rejected": -0.5782250165939331, "step": 625 }, { "epoch": 0.9463340891912321, "epsilon_dpo/beta": 0.0038077188655734062, "epsilon_dpo/beta_margin_grad_mean": -0.4186539649963379, "epsilon_dpo/beta_margin_grad_std": 0.10792769491672516, "epsilon_dpo/beta_margin_mean": 0.3568192422389984, "epsilon_dpo/beta_margin_std": 0.5255135297775269, "epsilon_dpo/loss_margin_mean": 94.29745483398438, "grad_norm": 24.139524459838867, "kl/avg_steps": 0.65625, "kl/beta": 0.003832488786429167, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.517825684323323e-09, "logits/chosen": -2.6338891983032227, "logits/rejected": -2.7265381813049316, "logps/chosen": -99.3694839477539, "logps/ref_chosen": -37.54242706298828, "logps/ref_rejected": -79.72758483886719, "logps/rejected": -235.8520965576172, "loss": 1.1206, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23634018003940582, "rewards/margins": 0.3568192720413208, "rewards/rejected": -0.5931594371795654, "step": 626 }, { "epoch": 0.9478458049886621, "epsilon_dpo/beta": 0.003780513536185026, "epsilon_dpo/beta_margin_grad_mean": -0.40569671988487244, "epsilon_dpo/beta_margin_grad_std": 0.09859655797481537, "epsilon_dpo/beta_margin_mean": 0.3945680260658264, "epsilon_dpo/beta_margin_std": 0.4171292185783386, "epsilon_dpo/loss_margin_mean": 104.91930389404297, "grad_norm": 26.619958877563477, "kl/avg_steps": 0.71875, "kl/beta": 0.003807501867413521, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.271028567242818e-09, "logits/chosen": -2.7842442989349365, "logits/rejected": -2.8724188804626465, "logps/chosen": -132.5081787109375, "logps/ref_chosen": -51.583740234375, "logps/ref_rejected": -91.06707763671875, "logps/rejected": -276.91082763671875, "loss": 1.072, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30688077211380005, "rewards/margins": 0.3945680260658264, "rewards/rejected": -0.7014487981796265, "step": 627 }, { "epoch": 0.9493575207860923, "epsilon_dpo/beta": 0.0037582609802484512, "epsilon_dpo/beta_margin_grad_mean": -0.41581279039382935, "epsilon_dpo/beta_margin_grad_std": 0.1070471853017807, "epsilon_dpo/beta_margin_mean": 0.3519708812236786, "epsilon_dpo/beta_margin_std": 0.4553413689136505, "epsilon_dpo/loss_margin_mean": 94.31489562988281, "grad_norm": 22.983261108398438, "kl/avg_steps": 0.59375, "kl/beta": 0.003780330764129758, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.0311050177251895e-09, "logits/chosen": -2.8451311588287354, "logits/rejected": -2.8059580326080322, "logps/chosen": -118.60432434082031, "logps/ref_chosen": -45.13766860961914, "logps/ref_rejected": -73.203369140625, "logps/rejected": -240.98492431640625, "loss": 1.1148, "rewards/accuracies": 0.828125, "rewards/chosen": -0.27747979760169983, "rewards/margins": 0.351970911026001, "rewards/rejected": -0.6294506788253784, "step": 628 }, { "epoch": 0.9508692365835223, "epsilon_dpo/beta": 0.003740775864571333, "epsilon_dpo/beta_margin_grad_mean": -0.42868202924728394, "epsilon_dpo/beta_margin_grad_std": 0.08597146719694138, "epsilon_dpo/beta_margin_mean": 0.29704347252845764, "epsilon_dpo/beta_margin_std": 0.36039096117019653, "epsilon_dpo/loss_margin_mean": 79.92932891845703, "grad_norm": 22.34004020690918, "kl/avg_steps": 0.46875, "kl/beta": 0.003758017672225833, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.798061746947995e-09, "logits/chosen": -2.968493938446045, "logits/rejected": -2.9278275966644287, "logps/chosen": -135.05108642578125, "logps/ref_chosen": -60.266902923583984, "logps/ref_rejected": -76.06556701660156, "logps/rejected": -230.77908325195312, "loss": 1.1426, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2802099585533142, "rewards/margins": 0.29704347252845764, "rewards/rejected": -0.5772534608840942, "step": 629 }, { "epoch": 0.9523809523809523, "epsilon_dpo/beta": 0.003717477899044752, "epsilon_dpo/beta_margin_grad_mean": -0.42418649792671204, "epsilon_dpo/beta_margin_grad_std": 0.10443674772977829, "epsilon_dpo/beta_margin_mean": 0.3174794614315033, "epsilon_dpo/beta_margin_std": 0.461091011762619, "epsilon_dpo/loss_margin_mean": 85.97848510742188, "grad_norm": 17.792505264282227, "kl/avg_steps": 0.625, "kl/beta": 0.0037404841277748346, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.5719052736323806e-09, "logits/chosen": -2.8431665897369385, "logits/rejected": -2.8568801879882812, "logps/chosen": -124.49134063720703, "logps/ref_chosen": -52.86223602294922, "logps/ref_rejected": -67.2427978515625, "logps/rejected": -224.85037231445312, "loss": 1.1439, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2672687768936157, "rewards/margins": 0.3174794316291809, "rewards/rejected": -0.5847482681274414, "step": 630 }, { "epoch": 0.9538926681783825, "epsilon_dpo/beta": 0.0036978733260184526, "epsilon_dpo/beta_margin_grad_mean": -0.4132729172706604, "epsilon_dpo/beta_margin_grad_std": 0.1023065596818924, "epsilon_dpo/beta_margin_mean": 0.3696702718734741, "epsilon_dpo/beta_margin_std": 0.4456428587436676, "epsilon_dpo/loss_margin_mean": 100.61067199707031, "grad_norm": 24.26034927368164, "kl/avg_steps": 0.53125, "kl/beta": 0.003717251354828477, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.352641923861144e-09, "logits/chosen": -2.931039810180664, "logits/rejected": -3.074314832687378, "logps/chosen": -126.13163757324219, "logps/ref_chosen": -54.735382080078125, "logps/ref_rejected": -88.74754333496094, "logps/rejected": -260.75445556640625, "loss": 1.0972, "rewards/accuracies": 0.796875, "rewards/chosen": -0.26499316096305847, "rewards/margins": 0.3696702718734741, "rewards/rejected": -0.634663462638855, "step": 631 }, { "epoch": 0.9554043839758125, "epsilon_dpo/beta": 0.003682370763272047, "epsilon_dpo/beta_margin_grad_mean": -0.4201679229736328, "epsilon_dpo/beta_margin_grad_std": 0.09138067066669464, "epsilon_dpo/beta_margin_mean": 0.3336542248725891, "epsilon_dpo/beta_margin_std": 0.38792508840560913, "epsilon_dpo/loss_margin_mean": 91.34883117675781, "grad_norm": 20.042898178100586, "kl/avg_steps": 0.421875, "kl/beta": 0.0036976076662540436, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.140277830901428e-09, "logits/chosen": -2.859529972076416, "logits/rejected": -2.6504650115966797, "logps/chosen": -129.81094360351562, "logps/ref_chosen": -54.572113037109375, "logps/ref_rejected": -61.702392578125, "logps/rejected": -228.29005432128906, "loss": 1.1163, "rewards/accuracies": 0.75, "rewards/chosen": -0.27816224098205566, "rewards/margins": 0.3336542248725891, "rewards/rejected": -0.6118165254592896, "step": 632 }, { "epoch": 0.9569160997732427, "epsilon_dpo/beta": 0.0036674821749329567, "epsilon_dpo/beta_margin_grad_mean": -0.43735867738723755, "epsilon_dpo/beta_margin_grad_std": 0.1024823933839798, "epsilon_dpo/beta_margin_mean": 0.2638026475906372, "epsilon_dpo/beta_margin_std": 0.43162766098976135, "epsilon_dpo/loss_margin_mean": 72.73527526855469, "grad_norm": 22.10957908630371, "kl/avg_steps": 0.40625, "kl/beta": 0.0036820739042013884, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.9348189350335007e-09, "logits/chosen": -2.8056819438934326, "logits/rejected": -2.7880361080169678, "logps/chosen": -114.36033630371094, "logps/ref_chosen": -47.56438446044922, "logps/ref_rejected": -62.69200134277344, "logps/rejected": -202.22323608398438, "loss": 1.1848, "rewards/accuracies": 0.75, "rewards/chosen": -0.24606679379940033, "rewards/margins": 0.2638026475906372, "rewards/rejected": -0.5098694562911987, "step": 633 }, { "epoch": 0.9584278155706727, "epsilon_dpo/beta": 0.0036606660578399897, "epsilon_dpo/beta_margin_grad_mean": -0.4639250338077545, "epsilon_dpo/beta_margin_grad_std": 0.0947214737534523, "epsilon_dpo/beta_margin_mean": 0.15465617179870605, "epsilon_dpo/beta_margin_std": 0.40847793221473694, "epsilon_dpo/loss_margin_mean": 42.98078918457031, "grad_norm": 21.879907608032227, "kl/avg_steps": 0.1875, "kl/beta": 0.003667176002636552, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 2.736270983384276e-09, "logits/chosen": -2.7410030364990234, "logits/rejected": -2.6300296783447266, "logps/chosen": -134.33767700195312, "logps/ref_chosen": -49.21610641479492, "logps/ref_rejected": -52.37419891357422, "logps/rejected": -180.4765625, "loss": 1.2774, "rewards/accuracies": 0.578125, "rewards/chosen": -0.3121024966239929, "rewards/margins": 0.15465617179870605, "rewards/rejected": -0.466758668422699, "step": 634 }, { "epoch": 0.9599395313681028, "epsilon_dpo/beta": 0.003640087554231286, "epsilon_dpo/beta_margin_grad_mean": -0.43818002939224243, "epsilon_dpo/beta_margin_grad_std": 0.09200131148099899, "epsilon_dpo/beta_margin_mean": 0.25685325264930725, "epsilon_dpo/beta_margin_std": 0.3824840188026428, "epsilon_dpo/loss_margin_mean": 71.14636993408203, "grad_norm": 22.298091888427734, "kl/avg_steps": 0.5625, "kl/beta": 0.0036603128537535667, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5446395297668287e-09, "logits/chosen": -2.8987905979156494, "logits/rejected": -2.926848888397217, "logps/chosen": -160.62265014648438, "logps/ref_chosen": -60.46380615234375, "logps/ref_rejected": -75.53865051269531, "logps/rejected": -246.8438720703125, "loss": 1.1815, "rewards/accuracies": 0.765625, "rewards/chosen": -0.365718275308609, "rewards/margins": 0.25685322284698486, "rewards/rejected": -0.6225715279579163, "step": 635 }, { "epoch": 0.9614512471655329, "epsilon_dpo/beta": 0.003616314148530364, "epsilon_dpo/beta_margin_grad_mean": -0.417025089263916, "epsilon_dpo/beta_margin_grad_std": 0.08757653832435608, "epsilon_dpo/beta_margin_mean": 0.34894245862960815, "epsilon_dpo/beta_margin_std": 0.3803805708885193, "epsilon_dpo/loss_margin_mean": 96.94343566894531, "grad_norm": 23.10371971130371, "kl/avg_steps": 0.65625, "kl/beta": 0.0036398388911038637, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.359929934524829e-09, "logits/chosen": -2.8144843578338623, "logits/rejected": -2.9669675827026367, "logps/chosen": -104.42010498046875, "logps/ref_chosen": -45.507652282714844, "logps/ref_rejected": -77.757568359375, "logps/rejected": -233.61346435546875, "loss": 1.1017, "rewards/accuracies": 0.859375, "rewards/chosen": -0.21348105370998383, "rewards/margins": 0.34894245862960815, "rewards/rejected": -0.5624235272407532, "step": 636 }, { "epoch": 0.9629629629629629, "epsilon_dpo/beta": 0.0036051683127880096, "epsilon_dpo/beta_margin_grad_mean": -0.4348142743110657, "epsilon_dpo/beta_margin_grad_std": 0.08453306555747986, "epsilon_dpo/beta_margin_mean": 0.27077460289001465, "epsilon_dpo/beta_margin_std": 0.3549792468547821, "epsilon_dpo/loss_margin_mean": 75.79081726074219, "grad_norm": 21.20489501953125, "kl/avg_steps": 0.3125, "kl/beta": 0.0036161080934107304, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.1821473643827137e-09, "logits/chosen": -2.8718724250793457, "logits/rejected": -2.82586407661438, "logps/chosen": -173.86734008789062, "logps/ref_chosen": -67.2421875, "logps/ref_rejected": -76.69988250732422, "logps/rejected": -259.1158447265625, "loss": 1.1642, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3856601119041443, "rewards/margins": 0.27077460289001465, "rewards/rejected": -0.6564347147941589, "step": 637 }, { "epoch": 0.9644746787603931, "epsilon_dpo/beta": 0.003584924153983593, "epsilon_dpo/beta_margin_grad_mean": -0.41527074575424194, "epsilon_dpo/beta_margin_grad_std": 0.09653671085834503, "epsilon_dpo/beta_margin_mean": 0.35729044675827026, "epsilon_dpo/beta_margin_std": 0.4160517752170563, "epsilon_dpo/loss_margin_mean": 100.26811218261719, "grad_norm": 20.479127883911133, "kl/avg_steps": 0.5625, "kl/beta": 0.003604843048378825, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.0112967923011646e-09, "logits/chosen": -2.7425355911254883, "logits/rejected": -2.768967866897583, "logps/chosen": -131.8798370361328, "logps/ref_chosen": -52.86170959472656, "logps/ref_rejected": -80.53226470947266, "logps/rejected": -259.8185119628906, "loss": 1.1017, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2845107913017273, "rewards/margins": 0.35729044675827026, "rewards/rejected": -0.6418012380599976, "step": 638 }, { "epoch": 0.9659863945578231, "epsilon_dpo/beta": 0.0035682327579706907, "epsilon_dpo/beta_margin_grad_mean": -0.4282129406929016, "epsilon_dpo/beta_margin_grad_std": 0.09305259585380554, "epsilon_dpo/beta_margin_mean": 0.30104079842567444, "epsilon_dpo/beta_margin_std": 0.39397481083869934, "epsilon_dpo/loss_margin_mean": 85.02322387695312, "grad_norm": 22.482376098632812, "kl/avg_steps": 0.46875, "kl/beta": 0.003584679216146469, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.847382997337943e-09, "logits/chosen": -2.805812358856201, "logits/rejected": -2.765056848526001, "logps/chosen": -117.75298309326172, "logps/ref_chosen": -45.926212310791016, "logps/ref_rejected": -64.27857971191406, "logps/rejected": -221.12857055664062, "loss": 1.1451, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2574160695075989, "rewards/margins": 0.30104079842567444, "rewards/rejected": -0.5584568977355957, "step": 639 }, { "epoch": 0.9674981103552532, "epsilon_dpo/beta": 0.0035437792539596558, "epsilon_dpo/beta_margin_grad_mean": -0.42435556650161743, "epsilon_dpo/beta_margin_grad_std": 0.09582994878292084, "epsilon_dpo/beta_margin_mean": 0.3155733048915863, "epsilon_dpo/beta_margin_std": 0.4039221704006195, "epsilon_dpo/loss_margin_mean": 89.62804412841797, "grad_norm": 24.7429141998291, "kl/avg_steps": 0.6875, "kl/beta": 0.0035679545253515244, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.690410564514244e-09, "logits/chosen": -2.7961437702178955, "logits/rejected": -2.892221689224243, "logps/chosen": -127.24449157714844, "logps/ref_chosen": -46.06987380981445, "logps/ref_rejected": -72.00648498535156, "logps/rejected": -242.80914306640625, "loss": 1.1348, "rewards/accuracies": 0.828125, "rewards/chosen": -0.28810858726501465, "rewards/margins": 0.3155732750892639, "rewards/rejected": -0.6036818623542786, "step": 640 }, { "epoch": 0.9690098261526833, "epsilon_dpo/beta": 0.0035306569188833237, "epsilon_dpo/beta_margin_grad_mean": -0.44718924164772034, "epsilon_dpo/beta_margin_grad_std": 0.07965311408042908, "epsilon_dpo/beta_margin_mean": 0.21816930174827576, "epsilon_dpo/beta_margin_std": 0.32938826084136963, "epsilon_dpo/loss_margin_mean": 62.4356575012207, "grad_norm": 24.2479305267334, "kl/avg_steps": 0.375, "kl/beta": 0.0035435922909528017, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5403838846864692e-09, "logits/chosen": -2.834224224090576, "logits/rejected": -2.7274556159973145, "logps/chosen": -151.83746337890625, "logps/ref_chosen": -62.31818389892578, "logps/ref_rejected": -72.54466247558594, "logps/rejected": -224.49960327148438, "loss": 1.2065, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3168661892414093, "rewards/margins": 0.21816930174827576, "rewards/rejected": -0.5350354909896851, "step": 641 }, { "epoch": 0.9705215419501134, "epsilon_dpo/beta": 0.0035174661315977573, "epsilon_dpo/beta_margin_grad_mean": -0.4405294954776764, "epsilon_dpo/beta_margin_grad_std": 0.08478718996047974, "epsilon_dpo/beta_margin_mean": 0.2457943707704544, "epsilon_dpo/beta_margin_std": 0.35019582509994507, "epsilon_dpo/loss_margin_mean": 70.578369140625, "grad_norm": 23.22334861755371, "kl/avg_steps": 0.375, "kl/beta": 0.003530353307723999, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.3973071544233218e-09, "logits/chosen": -2.8545050621032715, "logits/rejected": -2.674060821533203, "logps/chosen": -139.794677734375, "logps/ref_chosen": -58.85230255126953, "logps/ref_rejected": -63.89229202270508, "logps/rejected": -215.4130401611328, "loss": 1.1855, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2857569456100464, "rewards/margins": 0.2457943856716156, "rewards/rejected": -0.5315513014793396, "step": 642 }, { "epoch": 0.9720332577475435, "epsilon_dpo/beta": 0.0035010273568332195, "epsilon_dpo/beta_margin_grad_mean": -0.43383854627609253, "epsilon_dpo/beta_margin_grad_std": 0.09310735762119293, "epsilon_dpo/beta_margin_mean": 0.275696724653244, "epsilon_dpo/beta_margin_std": 0.3892333209514618, "epsilon_dpo/loss_margin_mean": 79.4657974243164, "grad_norm": 26.52947235107422, "kl/avg_steps": 0.46875, "kl/beta": 0.0035171639174222946, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.261184375888541e-09, "logits/chosen": -2.838277816772461, "logits/rejected": -2.8054704666137695, "logps/chosen": -140.97860717773438, "logps/ref_chosen": -58.589515686035156, "logps/ref_rejected": -74.7073974609375, "logps/rejected": -236.56228637695312, "loss": 1.1662, "rewards/accuracies": 0.71875, "rewards/chosen": -0.29013603925704956, "rewards/margins": 0.275696724653244, "rewards/rejected": -0.5658327341079712, "step": 643 }, { "epoch": 0.9735449735449735, "epsilon_dpo/beta": 0.003484140383079648, "epsilon_dpo/beta_margin_grad_mean": -0.4290357530117035, "epsilon_dpo/beta_margin_grad_std": 0.0820101872086525, "epsilon_dpo/beta_margin_mean": 0.2923728823661804, "epsilon_dpo/beta_margin_std": 0.3420700430870056, "epsilon_dpo/loss_margin_mean": 84.55711364746094, "grad_norm": 18.6878662109375, "kl/avg_steps": 0.484375, "kl/beta": 0.0035007542464882135, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -2.819199562072754, "logits/rejected": -2.805053234100342, "logps/chosen": -117.35118103027344, "logps/ref_chosen": -45.64036560058594, "logps/ref_rejected": -58.39754104614258, "logps/rejected": -214.66546630859375, "loss": 1.1436, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2507040798664093, "rewards/margins": 0.2923728823661804, "rewards/rejected": -0.5430769920349121, "step": 644 }, { "epoch": 0.9750566893424036, "epsilon_dpo/beta": 0.003466806374490261, "epsilon_dpo/beta_margin_grad_mean": -0.4383280873298645, "epsilon_dpo/beta_margin_grad_std": 0.08082949370145798, "epsilon_dpo/beta_margin_mean": 0.25321659445762634, "epsilon_dpo/beta_margin_std": 0.3326885402202606, "epsilon_dpo/loss_margin_mean": 73.65705871582031, "grad_norm": 21.3898983001709, "kl/avg_steps": 0.5, "kl/beta": 0.0034838791470974684, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0098157099674987e-09, "logits/chosen": -2.8883442878723145, "logits/rejected": -2.7249622344970703, "logps/chosen": -131.4900360107422, "logps/ref_chosen": -54.29890441894531, "logps/ref_rejected": -62.46510314941406, "logps/rejected": -213.31329345703125, "loss": 1.1762, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2689738869667053, "rewards/margins": 0.25321662425994873, "rewards/rejected": -0.522190511226654, "step": 645 }, { "epoch": 0.9765684051398337, "epsilon_dpo/beta": 0.003450642107054591, "epsilon_dpo/beta_margin_grad_mean": -0.4384441673755646, "epsilon_dpo/beta_margin_grad_std": 0.0880342572927475, "epsilon_dpo/beta_margin_mean": 0.2561708986759186, "epsilon_dpo/beta_margin_std": 0.3683074414730072, "epsilon_dpo/loss_margin_mean": 74.91301727294922, "grad_norm": 22.905025482177734, "kl/avg_steps": 0.46875, "kl/beta": 0.0034665465354919434, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.945768539031783e-10, "logits/chosen": -2.8017702102661133, "logits/rejected": -2.79258394241333, "logps/chosen": -149.4557647705078, "logps/ref_chosen": -56.12446212768555, "logps/ref_rejected": -71.72216033935547, "logps/rejected": -239.96649169921875, "loss": 1.1793, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3234636187553406, "rewards/margins": 0.25617092847824097, "rewards/rejected": -0.5796345472335815, "step": 646 }, { "epoch": 0.9780801209372638, "epsilon_dpo/beta": 0.0034259159583598375, "epsilon_dpo/beta_margin_grad_mean": -0.41254451870918274, "epsilon_dpo/beta_margin_grad_std": 0.09956267476081848, "epsilon_dpo/beta_margin_mean": 0.36902064085006714, "epsilon_dpo/beta_margin_std": 0.4401799440383911, "epsilon_dpo/loss_margin_mean": 108.2276840209961, "grad_norm": 21.12336540222168, "kl/avg_steps": 0.71875, "kl/beta": 0.0034503729548305273, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.863060120144316e-10, "logits/chosen": -2.9666121006011963, "logits/rejected": -2.9039487838745117, "logps/chosen": -151.60028076171875, "logps/ref_chosen": -59.301612854003906, "logps/ref_rejected": -95.66838073730469, "logps/rejected": -296.1947326660156, "loss": 1.0963, "rewards/accuracies": 0.859375, "rewards/chosen": -0.31780529022216797, "rewards/margins": 0.36902064085006714, "rewards/rejected": -0.6868259310722351, "step": 647 }, { "epoch": 0.9795918367346939, "epsilon_dpo/beta": 0.003408962395042181, "epsilon_dpo/beta_margin_grad_mean": -0.4289514124393463, "epsilon_dpo/beta_margin_grad_std": 0.0915476456284523, "epsilon_dpo/beta_margin_mean": 0.2957060635089874, "epsilon_dpo/beta_margin_std": 0.3852503001689911, "epsilon_dpo/loss_margin_mean": 87.4386215209961, "grad_norm": 19.848743438720703, "kl/avg_steps": 0.5, "kl/beta": 0.003425750182941556, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.850062128694045e-10, "logits/chosen": -2.791996955871582, "logits/rejected": -2.700218439102173, "logps/chosen": -140.98074340820312, "logps/ref_chosen": -56.8007698059082, "logps/ref_rejected": -72.17013549804688, "logps/rejected": -243.78872680664062, "loss": 1.1482, "rewards/accuracies": 0.75, "rewards/chosen": -0.2882939577102661, "rewards/margins": 0.29570603370666504, "rewards/rejected": -0.5839999914169312, "step": 648 }, { "epoch": 0.981103552532124, "epsilon_dpo/beta": 0.003388806479051709, "epsilon_dpo/beta_margin_grad_mean": -0.42987704277038574, "epsilon_dpo/beta_margin_grad_std": 0.09154313057661057, "epsilon_dpo/beta_margin_mean": 0.29058176279067993, "epsilon_dpo/beta_margin_std": 0.38440483808517456, "epsilon_dpo/loss_margin_mean": 86.38435363769531, "grad_norm": 20.142330169677734, "kl/avg_steps": 0.59375, "kl/beta": 0.003408706746995449, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.906802900412788e-10, "logits/chosen": -2.7263295650482178, "logits/rejected": -2.6930899620056152, "logps/chosen": -124.84034729003906, "logps/ref_chosen": -45.76455307006836, "logps/ref_rejected": -66.81488037109375, "logps/rejected": -232.2750244140625, "loss": 1.1525, "rewards/accuracies": 0.828125, "rewards/chosen": -0.269201397895813, "rewards/margins": 0.29058176279067993, "rewards/rejected": -0.5597831606864929, "step": 649 }, { "epoch": 0.982615268329554, "epsilon_dpo/beta": 0.0033688039984554052, "epsilon_dpo/beta_margin_grad_mean": -0.4207400381565094, "epsilon_dpo/beta_margin_grad_std": 0.08710543066263199, "epsilon_dpo/beta_margin_mean": 0.32889360189437866, "epsilon_dpo/beta_margin_std": 0.36216315627098083, "epsilon_dpo/loss_margin_mean": 98.25837707519531, "grad_norm": 17.375455856323242, "kl/avg_steps": 0.59375, "kl/beta": 0.0033885869197547436, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.033308820289184e-10, "logits/chosen": -2.752939462661743, "logits/rejected": -2.749907970428467, "logps/chosen": -112.35929107666016, "logps/ref_chosen": -47.4556884765625, "logps/ref_rejected": -69.6038589477539, "logps/rejected": -232.76583862304688, "loss": 1.1161, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2194691002368927, "rewards/margins": 0.32889360189437866, "rewards/rejected": -0.548362672328949, "step": 650 }, { "epoch": 0.9841269841269841, "epsilon_dpo/beta": 0.0033531205262988806, "epsilon_dpo/beta_margin_grad_mean": -0.4406028687953949, "epsilon_dpo/beta_margin_grad_std": 0.08775340765714645, "epsilon_dpo/beta_margin_mean": 0.24748744070529938, "epsilon_dpo/beta_margin_std": 0.36865469813346863, "epsilon_dpo/loss_margin_mean": 74.44540405273438, "grad_norm": 22.54496192932129, "kl/avg_steps": 0.46875, "kl/beta": 0.003368586068972945, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.2296043218295606e-10, "logits/chosen": -2.859612464904785, "logits/rejected": -2.813570261001587, "logps/chosen": -128.33189392089844, "logps/ref_chosen": -50.950531005859375, "logps/ref_rejected": -66.28712463378906, "logps/rejected": -218.1138916015625, "loss": 1.1869, "rewards/accuracies": 0.75, "rewards/chosen": -0.26044052839279175, "rewards/margins": 0.24748745560646057, "rewards/rejected": -0.5079280138015747, "step": 651 }, { "epoch": 0.9856386999244142, "epsilon_dpo/beta": 0.0033395823556929827, "epsilon_dpo/beta_margin_grad_mean": -0.43764322996139526, "epsilon_dpo/beta_margin_grad_std": 0.09326408058404922, "epsilon_dpo/beta_margin_mean": 0.2556740641593933, "epsilon_dpo/beta_margin_std": 0.3997213542461395, "epsilon_dpo/loss_margin_mean": 77.22918701171875, "grad_norm": 20.155189514160156, "kl/avg_steps": 0.40625, "kl/beta": 0.0033528695348650217, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.4957118863768176e-10, "logits/chosen": -2.8741235733032227, "logits/rejected": -2.662733554840088, "logps/chosen": -151.67822265625, "logps/ref_chosen": -56.13648986816406, "logps/ref_rejected": -67.399169921875, "logps/rejected": -240.17007446289062, "loss": 1.1853, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3204508423805237, "rewards/margins": 0.2556740641593933, "rewards/rejected": -0.576124906539917, "step": 652 }, { "epoch": 0.9871504157218443, "epsilon_dpo/beta": 0.0033187647350132465, "epsilon_dpo/beta_margin_grad_mean": -0.427230566740036, "epsilon_dpo/beta_margin_grad_std": 0.08171962946653366, "epsilon_dpo/beta_margin_mean": 0.30349186062812805, "epsilon_dpo/beta_margin_std": 0.3495093584060669, "epsilon_dpo/loss_margin_mean": 91.90961456298828, "grad_norm": 22.07168197631836, "kl/avg_steps": 0.625, "kl/beta": 0.0033393034245818853, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.831652042480093e-10, "logits/chosen": -2.945342540740967, "logits/rejected": -2.7773563861846924, "logps/chosen": -125.92410278320312, "logps/ref_chosen": -53.61786651611328, "logps/ref_rejected": -70.0718994140625, "logps/rejected": -234.28775024414062, "loss": 1.1348, "rewards/accuracies": 0.828125, "rewards/chosen": -0.24086423218250275, "rewards/margins": 0.30349186062812805, "rewards/rejected": -0.544356107711792, "step": 653 }, { "epoch": 0.9886621315192744, "epsilon_dpo/beta": 0.0033002253621816635, "epsilon_dpo/beta_margin_grad_mean": -0.4346742331981659, "epsilon_dpo/beta_margin_grad_std": 0.08571688830852509, "epsilon_dpo/beta_margin_mean": 0.27267175912857056, "epsilon_dpo/beta_margin_std": 0.3640969693660736, "epsilon_dpo/loss_margin_mean": 83.1996841430664, "grad_norm": 24.286649703979492, "kl/avg_steps": 0.5625, "kl/beta": 0.0033185624051839113, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.2374433653205016e-10, "logits/chosen": -2.6473493576049805, "logits/rejected": -2.8752939701080322, "logps/chosen": -137.35940551757812, "logps/ref_chosen": -50.120521545410156, "logps/ref_rejected": -78.58853149414062, "logps/rejected": -249.027099609375, "loss": 1.1639, "rewards/accuracies": 0.796875, "rewards/chosen": -0.28836315870285034, "rewards/margins": 0.27267175912857056, "rewards/rejected": -0.5610349178314209, "step": 654 }, { "epoch": 0.9901738473167044, "epsilon_dpo/beta": 0.003279702737927437, "epsilon_dpo/beta_margin_grad_mean": -0.4207151532173157, "epsilon_dpo/beta_margin_grad_std": 0.08204860985279083, "epsilon_dpo/beta_margin_mean": 0.32972094416618347, "epsilon_dpo/beta_margin_std": 0.34827113151550293, "epsilon_dpo/loss_margin_mean": 101.06061553955078, "grad_norm": 19.97102928161621, "kl/avg_steps": 0.625, "kl/beta": 0.0032999999821186066, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7131024761923852e-10, "logits/chosen": -2.7710342407226562, "logits/rejected": -2.8003931045532227, "logps/chosen": -108.00314331054688, "logps/ref_chosen": -42.75675964355469, "logps/ref_rejected": -70.48648834228516, "logps/rejected": -236.79348754882812, "loss": 1.1126, "rewards/accuracies": 0.875, "rewards/chosen": -0.21476933360099792, "rewards/margins": 0.32972094416618347, "rewards/rejected": -0.5444902777671814, "step": 655 }, { "epoch": 0.9916855631141346, "epsilon_dpo/beta": 0.0032613822259008884, "epsilon_dpo/beta_margin_grad_mean": -0.43266981840133667, "epsilon_dpo/beta_margin_grad_std": 0.08869948238134384, "epsilon_dpo/beta_margin_mean": 0.2799364924430847, "epsilon_dpo/beta_margin_std": 0.37124961614608765, "epsilon_dpo/loss_margin_mean": 86.4959487915039, "grad_norm": 23.93140983581543, "kl/avg_steps": 0.5625, "kl/beta": 0.0032795032020658255, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.2586440420372934e-10, "logits/chosen": -3.0065250396728516, "logits/rejected": -2.8719706535339355, "logps/chosen": -159.40811157226562, "logps/ref_chosen": -65.75037384033203, "logps/ref_rejected": -77.79585266113281, "logps/rejected": -257.94952392578125, "loss": 1.1592, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3061829209327698, "rewards/margins": 0.2799364924430847, "rewards/rejected": -0.5861194133758545, "step": 656 }, { "epoch": 0.9931972789115646, "epsilon_dpo/beta": 0.0032380432821810246, "epsilon_dpo/beta_margin_grad_mean": -0.41607293486595154, "epsilon_dpo/beta_margin_grad_std": 0.08618675917387009, "epsilon_dpo/beta_margin_mean": 0.34958741068840027, "epsilon_dpo/beta_margin_std": 0.36564967036247253, "epsilon_dpo/loss_margin_mean": 108.44845581054688, "grad_norm": 19.01677131652832, "kl/avg_steps": 0.71875, "kl/beta": 0.003261159174144268, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.740807750345913e-11, "logits/chosen": -2.7217354774475098, "logits/rejected": -2.671895980834961, "logps/chosen": -117.72611999511719, "logps/ref_chosen": -42.9866943359375, "logps/ref_rejected": -71.7634048461914, "logps/rejected": -254.9512939453125, "loss": 1.0991, "rewards/accuracies": 0.84375, "rewards/chosen": -0.24281863868236542, "rewards/margins": 0.3495873808860779, "rewards/rejected": -0.5924060344696045, "step": 657 }, { "epoch": 0.9947089947089947, "epsilon_dpo/beta": 0.0032250552903860807, "epsilon_dpo/beta_margin_grad_mean": -0.43769460916519165, "epsilon_dpo/beta_margin_grad_std": 0.0908212959766388, "epsilon_dpo/beta_margin_mean": 0.2616998851299286, "epsilon_dpo/beta_margin_std": 0.38718393445014954, "epsilon_dpo/loss_margin_mean": 81.89582061767578, "grad_norm": 18.544219970703125, "kl/avg_steps": 0.40625, "kl/beta": 0.003237886819988489, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.594234322453539e-11, "logits/chosen": -2.9648542404174805, "logits/rejected": -2.8824892044067383, "logps/chosen": -133.65147399902344, "logps/ref_chosen": -56.295372009277344, "logps/ref_rejected": -74.53610229492188, "logps/rejected": -233.78802490234375, "loss": 1.1775, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25034821033477783, "rewards/margins": 0.261699914932251, "rewards/rejected": -0.5120481252670288, "step": 658 }, { "epoch": 0.9962207105064248, "epsilon_dpo/beta": 0.0032140223775058985, "epsilon_dpo/beta_margin_grad_mean": -0.45865002274513245, "epsilon_dpo/beta_margin_grad_std": 0.08520904183387756, "epsilon_dpo/beta_margin_mean": 0.17159080505371094, "epsilon_dpo/beta_margin_std": 0.3541049361228943, "epsilon_dpo/loss_margin_mean": 54.13220977783203, "grad_norm": 25.61504364013672, "kl/avg_steps": 0.34375, "kl/beta": 0.003224786138162017, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.146808153123293e-11, "logits/chosen": -2.766911745071411, "logits/rejected": -2.6418380737304688, "logps/chosen": -141.21490478515625, "logps/ref_chosen": -49.89054489135742, "logps/ref_rejected": -54.54624938964844, "logps/rejected": -200.0028076171875, "loss": 1.2527, "rewards/accuracies": 0.71875, "rewards/chosen": -0.29459086060523987, "rewards/margins": 0.17159080505371094, "rewards/rejected": -0.4661816656589508, "step": 659 }, { "epoch": 0.9977324263038548, "epsilon_dpo/beta": 0.003194976830855012, "epsilon_dpo/beta_margin_grad_mean": -0.42317187786102295, "epsilon_dpo/beta_margin_grad_std": 0.08135712146759033, "epsilon_dpo/beta_margin_mean": 0.32237380743026733, "epsilon_dpo/beta_margin_std": 0.3604564964771271, "epsilon_dpo/loss_margin_mean": 101.43387603759766, "grad_norm": 21.981107711791992, "kl/avg_steps": 0.59375, "kl/beta": 0.003213738789781928, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3985977021235829e-11, "logits/chosen": -2.8313307762145996, "logits/rejected": -2.8559885025024414, "logps/chosen": -127.81207275390625, "logps/ref_chosen": -48.67169189453125, "logps/ref_rejected": -77.04704284667969, "logps/rejected": -257.6213073730469, "loss": 1.12, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2535557746887207, "rewards/margins": 0.32237380743026733, "rewards/rejected": -0.575929582118988, "step": 660 }, { "epoch": 0.999244142101285, "epsilon_dpo/beta": 0.0031781154684722424, "epsilon_dpo/beta_margin_grad_mean": -0.44626033306121826, "epsilon_dpo/beta_margin_grad_std": 0.08369455486536026, "epsilon_dpo/beta_margin_mean": 0.22157911956310272, "epsilon_dpo/beta_margin_std": 0.347888320684433, "epsilon_dpo/loss_margin_mean": 70.36561584472656, "grad_norm": 20.05291175842285, "kl/avg_steps": 0.53125, "kl/beta": 0.0031947698444128036, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.4965187065971735e-12, "logits/chosen": -2.79404878616333, "logits/rejected": -2.7915592193603516, "logps/chosen": -159.8495635986328, "logps/ref_chosen": -56.03480911254883, "logps/ref_rejected": -71.04798889160156, "logps/rejected": -245.2283477783203, "loss": 1.2065, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3311423063278198, "rewards/margins": 0.22157913446426392, "rewards/rejected": -0.5527214407920837, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.0336073330418967, "train_runtime": 2985.5995, "train_samples_per_second": 14.18, "train_steps_per_second": 0.221 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }