{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014684287812041115, "epsilon_dpo/beta": 0.09979122877120972, "epsilon_dpo/beta_margin_grad_mean": -0.4994426667690277, "epsilon_dpo/beta_margin_grad_std": 0.007109890226274729, "epsilon_dpo/beta_margin_mean": 0.0022308863699436188, "epsilon_dpo/beta_margin_std": 0.028449052944779396, "epsilon_dpo/loss_margin_mean": 0.024216145277023315, "grad_norm": 153.6117401123047, "kl/avg_steps": 0.21875, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 0.0, "logits/chosen": -3.435748338699341, "logits/rejected": -3.461001396179199, "logps/chosen": -31.484785079956055, "logps/ref_chosen": -31.500404357910156, "logps/ref_rejected": -79.35133361816406, "logps/rejected": -79.35993194580078, "loss": 1.3843, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0014821073273196816, "rewards/margins": 0.002230936661362648, "rewards/rejected": -0.0007488295086659491, "step": 1 }, { "epoch": 0.002936857562408223, "epsilon_dpo/beta": 0.09972933679819107, "epsilon_dpo/beta_margin_grad_mean": -0.5002588629722595, "epsilon_dpo/beta_margin_grad_std": 0.006068441551178694, "epsilon_dpo/beta_margin_mean": -0.0010360678425058722, "epsilon_dpo/beta_margin_std": 0.024278299883008003, "epsilon_dpo/loss_margin_mean": -0.008541211485862732, "grad_norm": 155.6954803466797, "kl/avg_steps": 0.0625, "kl/beta": 0.09978172928094864, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 7.246376811594203e-09, "logits/chosen": -3.517317533493042, "logits/rejected": -3.4876961708068848, "logps/chosen": -36.63703918457031, "logps/ref_chosen": -36.63695526123047, "logps/ref_rejected": -80.44895935058594, "logps/rejected": -80.44050598144531, "loss": 1.3875, "rewards/accuracies": 0.546875, "rewards/chosen": -8.885323768481612e-05, "rewards/margins": -0.0010360784363001585, "rewards/rejected": 0.0009472252568230033, "step": 2 }, { "epoch": 0.004405286343612335, "epsilon_dpo/beta": 0.09971363842487335, "epsilon_dpo/beta_margin_grad_mean": -0.4988003969192505, "epsilon_dpo/beta_margin_grad_std": 0.007179732900112867, "epsilon_dpo/beta_margin_mean": 0.004800259135663509, "epsilon_dpo/beta_margin_std": 0.028725432232022285, "epsilon_dpo/loss_margin_mean": 0.050338417291641235, "grad_norm": 160.3519744873047, "kl/avg_steps": 0.015625, "kl/beta": 0.09971940517425537, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -3.5040597915649414, "logits/rejected": -3.4887866973876953, "logps/chosen": -37.792213439941406, "logps/ref_chosen": -37.83708190917969, "logps/ref_rejected": -73.14408874511719, "logps/rejected": -73.14955139160156, "loss": 1.3817, "rewards/accuracies": 0.515625, "rewards/chosen": 0.004386060871183872, "rewards/margins": 0.004800218157470226, "rewards/rejected": -0.00041415661689825356, "step": 3 }, { "epoch": 0.005873715124816446, "epsilon_dpo/beta": 0.09960457682609558, "epsilon_dpo/beta_margin_grad_mean": -0.4994582235813141, "epsilon_dpo/beta_margin_grad_std": 0.007399262860417366, "epsilon_dpo/beta_margin_mean": 0.002166971331462264, "epsilon_dpo/beta_margin_std": 0.029604651033878326, "epsilon_dpo/loss_margin_mean": 0.02400125563144684, "grad_norm": 155.65963745117188, "kl/avg_steps": 0.109375, "kl/beta": 0.0997038260102272, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -3.468817710876465, "logits/rejected": -3.508214235305786, "logps/chosen": -43.339508056640625, "logps/ref_chosen": -43.336036682128906, "logps/ref_rejected": -93.7607650756836, "logps/rejected": -93.78823852539062, "loss": 1.3843, "rewards/accuracies": 0.53125, "rewards/chosen": -0.000430062209488824, "rewards/margins": 0.002166960621252656, "rewards/rejected": -0.0025970228016376495, "step": 4 }, { "epoch": 0.007342143906020558, "epsilon_dpo/beta": 0.09963598102331161, "epsilon_dpo/beta_margin_grad_mean": -0.5009759068489075, "epsilon_dpo/beta_margin_grad_std": 0.008376221172511578, "epsilon_dpo/beta_margin_mean": -0.003906682599335909, "epsilon_dpo/beta_margin_std": 0.033521223813295364, "epsilon_dpo/loss_margin_mean": -0.036883413791656494, "grad_norm": 179.6461639404297, "kl/avg_steps": -0.03125, "kl/beta": 0.09959489107131958, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 2.898550724637681e-08, "logits/chosen": -3.436485767364502, "logits/rejected": -3.4411563873291016, "logps/chosen": -32.93244934082031, "logps/ref_chosen": -32.90675354003906, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -90.29646301269531, "loss": 1.3905, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0026276579592376947, "rewards/margins": -0.003906670026481152, "rewards/rejected": 0.0012790121836587787, "step": 5 }, { "epoch": 0.00881057268722467, "epsilon_dpo/beta": 0.09954258054494858, "epsilon_dpo/beta_margin_grad_mean": -0.4993135929107666, "epsilon_dpo/beta_margin_grad_std": 0.006349037401378155, "epsilon_dpo/beta_margin_mean": 0.002748197177425027, "epsilon_dpo/beta_margin_std": 0.025408554822206497, "epsilon_dpo/loss_margin_mean": 0.029358193278312683, "grad_norm": 168.01638793945312, "kl/avg_steps": 0.09375, "kl/beta": 0.09962602704763412, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -3.476027011871338, "logits/rejected": -3.391662836074829, "logps/chosen": -40.59587478637695, "logps/ref_chosen": -40.57701110839844, "logps/ref_rejected": -98.47296142578125, "logps/rejected": -98.52117919921875, "loss": 1.3837, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0019228225573897362, "rewards/margins": 0.0027480702847242355, "rewards/rejected": -0.004670892842113972, "step": 6 }, { "epoch": 0.010279001468428781, "epsilon_dpo/beta": 0.09951155632734299, "epsilon_dpo/beta_margin_grad_mean": -0.49992430210113525, "epsilon_dpo/beta_margin_grad_std": 0.007149491459131241, "epsilon_dpo/beta_margin_mean": 0.0003023834724444896, "epsilon_dpo/beta_margin_std": 0.028604645282030106, "epsilon_dpo/loss_margin_mean": 0.00525471568107605, "grad_norm": 165.30645751953125, "kl/avg_steps": 0.03125, "kl/beta": 0.09953271597623825, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.347826086956521e-08, "logits/chosen": -3.4681951999664307, "logits/rejected": -3.4873459339141846, "logps/chosen": -41.3924674987793, "logps/ref_chosen": -41.414642333984375, "logps/ref_rejected": -111.01716613769531, "logps/rejected": -111.000244140625, "loss": 1.3862, "rewards/accuracies": 0.515625, "rewards/chosen": 0.002097110729664564, "rewards/margins": 0.0003024058823939413, "rewards/rejected": 0.0017947049345821142, "step": 7 }, { "epoch": 0.011747430249632892, "epsilon_dpo/beta": 0.09983792901039124, "epsilon_dpo/beta_margin_grad_mean": -0.5019076466560364, "epsilon_dpo/beta_margin_grad_std": 0.006241849157959223, "epsilon_dpo/beta_margin_mean": -0.007631635759025812, "epsilon_dpo/beta_margin_std": 0.02497241646051407, "epsilon_dpo/loss_margin_mean": -0.07474762201309204, "grad_norm": 170.1224822998047, "kl/avg_steps": -0.328125, "kl/beta": 0.09950161725282669, "kl/n_epsilon_steps": 0.65625, "kl/p_epsilon_steps": 0.328125, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -3.5701963901519775, "logits/rejected": -3.46230411529541, "logps/chosen": -39.27941131591797, "logps/ref_chosen": -39.25566482543945, "logps/ref_rejected": -85.94947814941406, "logps/rejected": -85.89848327636719, "loss": 1.3941, "rewards/accuracies": 0.328125, "rewards/chosen": -0.0024497562553733587, "rewards/margins": -0.007631613872945309, "rewards/rejected": 0.005181857850402594, "step": 8 }, { "epoch": 0.013215859030837005, "epsilon_dpo/beta": 0.09983916580677032, "epsilon_dpo/beta_margin_grad_mean": -0.5000754594802856, "epsilon_dpo/beta_margin_grad_std": 0.006013626232743263, "epsilon_dpo/beta_margin_mean": -0.0003015802940353751, "epsilon_dpo/beta_margin_std": 0.02405872382223606, "epsilon_dpo/loss_margin_mean": -0.0012376010417938232, "grad_norm": 179.10533142089844, "kl/avg_steps": 0.0, "kl/beta": 0.09982918202877045, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 5.797101449275362e-08, "logits/chosen": -3.526092529296875, "logits/rejected": -3.532437801361084, "logps/chosen": -38.925682067871094, "logps/ref_chosen": -38.9265251159668, "logps/ref_rejected": -106.43075561523438, "logps/rejected": -106.42868041992188, "loss": 1.3867, "rewards/accuracies": 0.453125, "rewards/chosen": -4.6280911192297935e-07, "rewards/margins": -0.0003015893744304776, "rewards/rejected": 0.00030112662352621555, "step": 9 }, { "epoch": 0.014684287812041116, "epsilon_dpo/beta": 0.09972980618476868, "epsilon_dpo/beta_margin_grad_mean": -0.49933433532714844, "epsilon_dpo/beta_margin_grad_std": 0.005578126758337021, "epsilon_dpo/beta_margin_mean": 0.0026634030509740114, "epsilon_dpo/beta_margin_std": 0.02231536991894245, "epsilon_dpo/loss_margin_mean": 0.028553977608680725, "grad_norm": 161.84414672851562, "kl/avg_steps": 0.109375, "kl/beta": 0.09982918202877045, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.546875, "learning_rate": 6.521739130434782e-08, "logits/chosen": -3.487415313720703, "logits/rejected": -3.4311609268188477, "logps/chosen": -37.931678771972656, "logps/ref_chosen": -37.94172668457031, "logps/ref_rejected": -84.4234848022461, "logps/rejected": -84.44198608398438, "loss": 1.3838, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0009347056620754302, "rewards/margins": 0.0026633867528289557, "rewards/rejected": -0.0017286810325458646, "step": 10 }, { "epoch": 0.016152716593245228, "epsilon_dpo/beta": 0.09954308718442917, "epsilon_dpo/beta_margin_grad_mean": -0.49776411056518555, "epsilon_dpo/beta_margin_grad_std": 0.006337402854114771, "epsilon_dpo/beta_margin_mean": 0.008945857174694538, "epsilon_dpo/beta_margin_std": 0.02535552717745304, "epsilon_dpo/loss_margin_mean": 0.09173570573329926, "grad_norm": 167.265380859375, "kl/avg_steps": 0.1875, "kl/beta": 0.09972011297941208, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 7.246376811594203e-08, "logits/chosen": -3.4978814125061035, "logits/rejected": -3.5033230781555176, "logps/chosen": -32.73089599609375, "logps/ref_chosen": -32.742462158203125, "logps/ref_rejected": -94.11013793945312, "logps/rejected": -94.19029998779297, "loss": 1.3775, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0010777543066069484, "rewards/margins": 0.008945956826210022, "rewards/rejected": -0.007868202403187752, "step": 11 }, { "epoch": 0.01762114537444934, "epsilon_dpo/beta": 0.09929458051919937, "epsilon_dpo/beta_margin_grad_mean": -0.49879196286201477, "epsilon_dpo/beta_margin_grad_std": 0.008286659605801105, "epsilon_dpo/beta_margin_mean": 0.004831877537071705, "epsilon_dpo/beta_margin_std": 0.03315826877951622, "epsilon_dpo/loss_margin_mean": 0.05105578899383545, "grad_norm": 184.79959106445312, "kl/avg_steps": 0.25, "kl/beta": 0.09953349083662033, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 7.971014492753623e-08, "logits/chosen": -3.5019748210906982, "logits/rejected": -3.507927417755127, "logps/chosen": -43.87688446044922, "logps/ref_chosen": -43.85453796386719, "logps/ref_rejected": -111.72984313964844, "logps/rejected": -111.80323791503906, "loss": 1.3817, "rewards/accuracies": 0.625, "rewards/chosen": -0.0023162683937698603, "rewards/margins": 0.004831977654248476, "rewards/rejected": -0.007148245815187693, "step": 12 }, { "epoch": 0.01908957415565345, "epsilon_dpo/beta": 0.09935726225376129, "epsilon_dpo/beta_margin_grad_mean": -0.49919238686561584, "epsilon_dpo/beta_margin_grad_std": 0.006513380445539951, "epsilon_dpo/beta_margin_mean": 0.00323132099583745, "epsilon_dpo/beta_margin_std": 0.026057543233036995, "epsilon_dpo/loss_margin_mean": 0.034654468297958374, "grad_norm": 170.2859344482422, "kl/avg_steps": -0.0625, "kl/beta": 0.09928527474403381, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 8.695652173913042e-08, "logits/chosen": -3.540365219116211, "logits/rejected": -3.4680285453796387, "logps/chosen": -41.653934478759766, "logps/ref_chosen": -41.67176818847656, "logps/ref_rejected": -91.09086608886719, "logps/rejected": -91.1076889038086, "loss": 1.3832, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0017041495302692056, "rewards/margins": 0.0032313289120793343, "rewards/rejected": -0.0015271796146407723, "step": 13 }, { "epoch": 0.020558002936857563, "epsilon_dpo/beta": 0.09892261028289795, "epsilon_dpo/beta_margin_grad_mean": -0.49505484104156494, "epsilon_dpo/beta_margin_grad_std": 0.007620512507855892, "epsilon_dpo/beta_margin_mean": 0.01978609710931778, "epsilon_dpo/beta_margin_std": 0.03049391135573387, "epsilon_dpo/loss_margin_mean": 0.20201179385185242, "grad_norm": 191.28602600097656, "kl/avg_steps": 0.4375, "kl/beta": 0.09934736788272858, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.420289855072464e-08, "logits/chosen": -3.5063905715942383, "logits/rejected": -3.5407614707946777, "logps/chosen": -32.60367202758789, "logps/ref_chosen": -32.668601989746094, "logps/ref_rejected": -111.8526611328125, "logps/rejected": -111.98973846435547, "loss": 1.3668, "rewards/accuracies": 0.734375, "rewards/chosen": 0.0063568041659891605, "rewards/margins": 0.019785966724157333, "rewards/rejected": -0.01342916302382946, "step": 14 }, { "epoch": 0.022026431718061675, "epsilon_dpo/beta": 0.09844519197940826, "epsilon_dpo/beta_margin_grad_mean": -0.4958465099334717, "epsilon_dpo/beta_margin_grad_std": 0.006347531918436289, "epsilon_dpo/beta_margin_mean": 0.016617843881249428, "epsilon_dpo/beta_margin_std": 0.025397835299372673, "epsilon_dpo/loss_margin_mean": 0.17031680047512054, "grad_norm": 144.098388671875, "kl/avg_steps": 0.484375, "kl/beta": 0.09891461580991745, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -3.4299309253692627, "logits/rejected": -3.4826550483703613, "logps/chosen": -41.79482650756836, "logps/ref_chosen": -41.78295135498047, "logps/ref_rejected": -86.42213439941406, "logps/rejected": -86.60433197021484, "loss": 1.3699, "rewards/accuracies": 0.734375, "rewards/chosen": -0.0012117127189412713, "rewards/margins": 0.01661786250770092, "rewards/rejected": -0.01782957650721073, "step": 15 }, { "epoch": 0.023494860499265784, "epsilon_dpo/beta": 0.09795541316270828, "epsilon_dpo/beta_margin_grad_mean": -0.4958525598049164, "epsilon_dpo/beta_margin_grad_std": 0.007386185694485903, "epsilon_dpo/beta_margin_mean": 0.016594676300883293, "epsilon_dpo/beta_margin_std": 0.029557235538959503, "epsilon_dpo/loss_margin_mean": 0.1711507886648178, "grad_norm": 150.39041137695312, "kl/avg_steps": 0.5, "kl/beta": 0.09843780845403671, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -3.477813959121704, "logits/rejected": -3.4168307781219482, "logps/chosen": -40.885353088378906, "logps/ref_chosen": -40.9011116027832, "logps/ref_rejected": -88.89961242675781, "logps/rejected": -89.05500793457031, "loss": 1.37, "rewards/accuracies": 0.75, "rewards/chosen": 0.0014686340000480413, "rewards/margins": 0.016594652086496353, "rewards/rejected": -0.015126017853617668, "step": 16 }, { "epoch": 0.024963289280469897, "epsilon_dpo/beta": 0.09734562784433365, "epsilon_dpo/beta_margin_grad_mean": -0.49166470766067505, "epsilon_dpo/beta_margin_grad_std": 0.009398871101439, "epsilon_dpo/beta_margin_mean": 0.033359427005052567, "epsilon_dpo/beta_margin_std": 0.037631068378686905, "epsilon_dpo/loss_margin_mean": 0.34458500146865845, "grad_norm": 171.44161987304688, "kl/avg_steps": 0.625, "kl/beta": 0.09794806689023972, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -3.526141405105591, "logits/rejected": -3.4046664237976074, "logps/chosen": -38.45612335205078, "logps/ref_chosen": -38.555274963378906, "logps/ref_rejected": -91.66143798828125, "logps/rejected": -91.90687561035156, "loss": 1.3536, "rewards/accuracies": 0.8125, "rewards/chosen": 0.00957987830042839, "rewards/margins": 0.03335944563150406, "rewards/rejected": -0.023779571056365967, "step": 17 }, { "epoch": 0.02643171806167401, "epsilon_dpo/beta": 0.09655846655368805, "epsilon_dpo/beta_margin_grad_mean": -0.4906516373157501, "epsilon_dpo/beta_margin_grad_std": 0.009394010528922081, "epsilon_dpo/beta_margin_mean": 0.03741518035531044, "epsilon_dpo/beta_margin_std": 0.03761782497167587, "epsilon_dpo/loss_margin_mean": 0.3884708285331726, "grad_norm": 174.4280242919922, "kl/avg_steps": 0.8125, "kl/beta": 0.0973396971821785, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -3.457716464996338, "logits/rejected": -3.3878774642944336, "logps/chosen": -26.462953567504883, "logps/ref_chosen": -26.55130386352539, "logps/ref_rejected": -82.15496063232422, "logps/rejected": -82.455078125, "loss": 1.3496, "rewards/accuracies": 0.90625, "rewards/chosen": 0.008510860614478588, "rewards/margins": 0.03741515427827835, "rewards/rejected": -0.028904292732477188, "step": 18 }, { "epoch": 0.027900146842878122, "epsilon_dpo/beta": 0.0959613099694252, "epsilon_dpo/beta_margin_grad_mean": -0.4871312379837036, "epsilon_dpo/beta_margin_grad_std": 0.014039521105587482, "epsilon_dpo/beta_margin_mean": 0.05153882876038551, "epsilon_dpo/beta_margin_std": 0.05628020688891411, "epsilon_dpo/loss_margin_mean": 0.5397706031799316, "grad_norm": 158.5471649169922, "kl/avg_steps": 0.625, "kl/beta": 0.0965551808476448, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -3.5245118141174316, "logits/rejected": -3.4201385974884033, "logps/chosen": -51.44847869873047, "logps/ref_chosen": -51.548377990722656, "logps/ref_rejected": -95.98385620117188, "logps/rejected": -96.42372131347656, "loss": 1.3362, "rewards/accuracies": 0.828125, "rewards/chosen": 0.009450599551200867, "rewards/margins": 0.0515388660132885, "rewards/rejected": -0.04208826646208763, "step": 19 }, { "epoch": 0.02936857562408223, "epsilon_dpo/beta": 0.09518533945083618, "epsilon_dpo/beta_margin_grad_mean": -0.4861924350261688, "epsilon_dpo/beta_margin_grad_std": 0.01256165187805891, "epsilon_dpo/beta_margin_mean": 0.05528602749109268, "epsilon_dpo/beta_margin_std": 0.05034392327070236, "epsilon_dpo/loss_margin_mean": 0.5822827816009521, "grad_norm": 148.6184844970703, "kl/avg_steps": 0.8125, "kl/beta": 0.09595546126365662, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -3.5266153812408447, "logits/rejected": -3.454347848892212, "logps/chosen": -32.44221878051758, "logps/ref_chosen": -32.57563781738281, "logps/ref_rejected": -83.72441101074219, "logps/rejected": -84.17327880859375, "loss": 1.3324, "rewards/accuracies": 0.90625, "rewards/chosen": 0.012646486982703209, "rewards/margins": 0.055285997688770294, "rewards/rejected": -0.042639512568712234, "step": 20 }, { "epoch": 0.030837004405286344, "epsilon_dpo/beta": 0.09435869753360748, "epsilon_dpo/beta_margin_grad_mean": -0.485392689704895, "epsilon_dpo/beta_margin_grad_std": 0.01355255488306284, "epsilon_dpo/beta_margin_mean": 0.058501191437244415, "epsilon_dpo/beta_margin_std": 0.054356660693883896, "epsilon_dpo/loss_margin_mean": 0.6211739182472229, "grad_norm": 138.44097900390625, "kl/avg_steps": 0.875, "kl/beta": 0.09518210589885712, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -3.416736602783203, "logits/rejected": -3.4210104942321777, "logps/chosen": -36.161651611328125, "logps/ref_chosen": -36.24628448486328, "logps/ref_rejected": -82.68882751464844, "logps/rejected": -83.22535705566406, "loss": 1.3294, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007934953086078167, "rewards/margins": 0.05850117653608322, "rewards/rejected": -0.05056622251868248, "step": 21 }, { "epoch": 0.032305433186490456, "epsilon_dpo/beta": 0.09351073950529099, "epsilon_dpo/beta_margin_grad_mean": -0.48178407549858093, "epsilon_dpo/beta_margin_grad_std": 0.01465072762221098, "epsilon_dpo/beta_margin_mean": 0.07297518849372864, "epsilon_dpo/beta_margin_std": 0.05880572646856308, "epsilon_dpo/loss_margin_mean": 0.7812974452972412, "grad_norm": 163.26019287109375, "kl/avg_steps": 0.90625, "kl/beta": 0.0943564921617508, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -3.4850616455078125, "logits/rejected": -3.4672698974609375, "logps/chosen": -44.6229248046875, "logps/ref_chosen": -44.70884704589844, "logps/ref_rejected": -103.42787170410156, "logps/rejected": -104.12324523925781, "loss": 1.3155, "rewards/accuracies": 0.953125, "rewards/chosen": 0.007986144162714481, "rewards/margins": 0.07297517359256744, "rewards/rejected": -0.06498903036117554, "step": 22 }, { "epoch": 0.033773861967694566, "epsilon_dpo/beta": 0.09270013123750687, "epsilon_dpo/beta_margin_grad_mean": -0.47907260060310364, "epsilon_dpo/beta_margin_grad_std": 0.01780831441283226, "epsilon_dpo/beta_margin_mean": 0.08389898389577866, "epsilon_dpo/beta_margin_std": 0.07160933315753937, "epsilon_dpo/loss_margin_mean": 0.9064887762069702, "grad_norm": 153.97116088867188, "kl/avg_steps": 0.875, "kl/beta": 0.09350906312465668, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -3.4643330574035645, "logits/rejected": -3.442781686782837, "logps/chosen": -40.120094299316406, "logps/ref_chosen": -40.26862335205078, "logps/ref_rejected": -85.85059356689453, "logps/rejected": -86.60855102539062, "loss": 1.3054, "rewards/accuracies": 0.9375, "rewards/chosen": 0.013693436048924923, "rewards/margins": 0.08389902114868164, "rewards/rejected": -0.07020558416843414, "step": 23 }, { "epoch": 0.03524229074889868, "epsilon_dpo/beta": 0.0919250100851059, "epsilon_dpo/beta_margin_grad_mean": -0.4745597541332245, "epsilon_dpo/beta_margin_grad_std": 0.019036216661334038, "epsilon_dpo/beta_margin_mean": 0.10201232135295868, "epsilon_dpo/beta_margin_std": 0.07649125903844833, "epsilon_dpo/loss_margin_mean": 1.1116865873336792, "grad_norm": 161.09317016601562, "kl/avg_steps": 0.84375, "kl/beta": 0.09269795566797256, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -3.4035227298736572, "logits/rejected": -3.4438557624816895, "logps/chosen": -27.038997650146484, "logps/ref_chosen": -27.20970916748047, "logps/ref_rejected": -106.27947998046875, "logps/rejected": -107.220458984375, "loss": 1.2883, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015648098662495613, "rewards/margins": 0.10201247036457062, "rewards/rejected": -0.08636436611413956, "step": 24 }, { "epoch": 0.03671071953010279, "epsilon_dpo/beta": 0.09127078950405121, "epsilon_dpo/beta_margin_grad_mean": -0.47578126192092896, "epsilon_dpo/beta_margin_grad_std": 0.023359432816505432, "epsilon_dpo/beta_margin_mean": 0.09719490259885788, "epsilon_dpo/beta_margin_std": 0.09393724054098129, "epsilon_dpo/loss_margin_mean": 1.0689027309417725, "grad_norm": 128.98117065429688, "kl/avg_steps": 0.71875, "kl/beta": 0.09192235767841339, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -3.487727403640747, "logits/rejected": -3.466651439666748, "logps/chosen": -36.42045593261719, "logps/ref_chosen": -36.47064208984375, "logps/ref_rejected": -93.89593505859375, "logps/rejected": -94.91465759277344, "loss": 1.2937, "rewards/accuracies": 0.859375, "rewards/chosen": 0.004450969398021698, "rewards/margins": 0.09719488024711609, "rewards/rejected": -0.0927439033985138, "step": 25 }, { "epoch": 0.0381791483113069, "epsilon_dpo/beta": 0.0905909463763237, "epsilon_dpo/beta_margin_grad_mean": -0.4600984454154968, "epsilon_dpo/beta_margin_grad_std": 0.04119768738746643, "epsilon_dpo/beta_margin_mean": 0.16133488714694977, "epsilon_dpo/beta_margin_std": 0.1687132865190506, "epsilon_dpo/loss_margin_mean": 1.7871633768081665, "grad_norm": 150.258056640625, "kl/avg_steps": 0.75, "kl/beta": 0.09126638621091843, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -3.4571170806884766, "logits/rejected": -3.5464980602264404, "logps/chosen": -39.704803466796875, "logps/ref_chosen": -39.82624816894531, "logps/ref_rejected": -109.0130615234375, "logps/rejected": -110.67877197265625, "loss": 1.2385, "rewards/accuracies": 0.875, "rewards/chosen": 0.010786900296807289, "rewards/margins": 0.16133487224578857, "rewards/rejected": -0.15054798126220703, "step": 26 }, { "epoch": 0.039647577092511016, "epsilon_dpo/beta": 0.08971839398145676, "epsilon_dpo/beta_margin_grad_mean": -0.4532409906387329, "epsilon_dpo/beta_margin_grad_std": 0.02980455383658409, "epsilon_dpo/beta_margin_mean": 0.1883021891117096, "epsilon_dpo/beta_margin_std": 0.1207134947180748, "epsilon_dpo/loss_margin_mean": 2.099519729614258, "grad_norm": 152.13011169433594, "kl/avg_steps": 0.96875, "kl/beta": 0.09058698266744614, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -3.417879104614258, "logits/rejected": -3.4293429851531982, "logps/chosen": -19.790260314941406, "logps/ref_chosen": -19.93426513671875, "logps/ref_rejected": -107.32525634765625, "logps/rejected": -109.28077697753906, "loss": 1.2104, "rewards/accuracies": 0.984375, "rewards/chosen": 0.012909766286611557, "rewards/margins": 0.18830212950706482, "rewards/rejected": -0.17539237439632416, "step": 27 }, { "epoch": 0.041116005873715125, "epsilon_dpo/beta": 0.08888562768697739, "epsilon_dpo/beta_margin_grad_mean": -0.4523911476135254, "epsilon_dpo/beta_margin_grad_std": 0.03366325423121452, "epsilon_dpo/beta_margin_mean": 0.1920655220746994, "epsilon_dpo/beta_margin_std": 0.13740301132202148, "epsilon_dpo/loss_margin_mean": 2.162940263748169, "grad_norm": 144.26730346679688, "kl/avg_steps": 0.9375, "kl/beta": 0.08971784263849258, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -3.5261058807373047, "logits/rejected": -3.475046157836914, "logps/chosen": -43.32648849487305, "logps/ref_chosen": -43.6025390625, "logps/ref_rejected": -96.1494140625, "logps/rejected": -98.03630065917969, "loss": 1.2081, "rewards/accuracies": 0.96875, "rewards/chosen": 0.02439035288989544, "rewards/margins": 0.19206556677818298, "rewards/rejected": -0.1676751971244812, "step": 28 }, { "epoch": 0.042584434654919234, "epsilon_dpo/beta": 0.08808783441781998, "epsilon_dpo/beta_margin_grad_mean": -0.4418832063674927, "epsilon_dpo/beta_margin_grad_std": 0.04080166295170784, "epsilon_dpo/beta_margin_mean": 0.23540179431438446, "epsilon_dpo/beta_margin_std": 0.1678141951560974, "epsilon_dpo/loss_margin_mean": 2.675217866897583, "grad_norm": 141.24661254882812, "kl/avg_steps": 0.90625, "kl/beta": 0.08888454735279083, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.028985507246377e-07, "logits/chosen": -3.4727797508239746, "logits/rejected": -3.4470648765563965, "logps/chosen": -32.09503936767578, "logps/ref_chosen": -32.44408416748047, "logps/ref_rejected": -101.98307037353516, "logps/rejected": -104.30924224853516, "loss": 1.1716, "rewards/accuracies": 0.953125, "rewards/chosen": 0.03065740317106247, "rewards/margins": 0.2354017198085785, "rewards/rejected": -0.20474430918693542, "step": 29 }, { "epoch": 0.04405286343612335, "epsilon_dpo/beta": 0.08729670941829681, "epsilon_dpo/beta_margin_grad_mean": -0.44231948256492615, "epsilon_dpo/beta_margin_grad_std": 0.04499894008040428, "epsilon_dpo/beta_margin_mean": 0.23390042781829834, "epsilon_dpo/beta_margin_std": 0.1849268078804016, "epsilon_dpo/loss_margin_mean": 2.6831705570220947, "grad_norm": 139.83612060546875, "kl/avg_steps": 0.90625, "kl/beta": 0.08808626234531403, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -3.5117549896240234, "logits/rejected": -3.389063835144043, "logps/chosen": -39.259483337402344, "logps/ref_chosen": -39.2830810546875, "logps/ref_rejected": -103.8922119140625, "logps/rejected": -106.5517807006836, "loss": 1.1744, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0018627983517944813, "rewards/margins": 0.23390044271945953, "rewards/rejected": -0.23203766345977783, "step": 30 }, { "epoch": 0.04552129221732746, "epsilon_dpo/beta": 0.08656726032495499, "epsilon_dpo/beta_margin_grad_mean": -0.44433295726776123, "epsilon_dpo/beta_margin_grad_std": 0.04770767688751221, "epsilon_dpo/beta_margin_mean": 0.22623342275619507, "epsilon_dpo/beta_margin_std": 0.1985906958580017, "epsilon_dpo/loss_margin_mean": 2.6196024417877197, "grad_norm": 121.26469421386719, "kl/avg_steps": 0.84375, "kl/beta": 0.08729515224695206, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -3.4779539108276367, "logits/rejected": -3.431878089904785, "logps/chosen": -35.737491607666016, "logps/ref_chosen": -36.05577850341797, "logps/ref_rejected": -78.35195922851562, "logps/rejected": -80.65327453613281, "loss": 1.1824, "rewards/accuracies": 0.921875, "rewards/chosen": 0.02715596929192543, "rewards/margins": 0.22623339295387268, "rewards/rejected": -0.19907742738723755, "step": 31 }, { "epoch": 0.04698972099853157, "epsilon_dpo/beta": 0.08576179295778275, "epsilon_dpo/beta_margin_grad_mean": -0.42864087224006653, "epsilon_dpo/beta_margin_grad_std": 0.05508127063512802, "epsilon_dpo/beta_margin_mean": 0.292092889547348, "epsilon_dpo/beta_margin_std": 0.2318342626094818, "epsilon_dpo/loss_margin_mean": 3.4083411693573, "grad_norm": 131.14877319335938, "kl/avg_steps": 0.9375, "kl/beta": 0.08656476438045502, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -3.4760398864746094, "logits/rejected": -3.5285589694976807, "logps/chosen": -32.82677459716797, "logps/ref_chosen": -33.10527420043945, "logps/ref_rejected": -95.47318267822266, "logps/rejected": -98.60302734375, "loss": 1.1284, "rewards/accuracies": 0.96875, "rewards/chosen": 0.023832818493247032, "rewards/margins": 0.2920929789543152, "rewards/rejected": -0.2682601809501648, "step": 32 }, { "epoch": 0.048458149779735685, "epsilon_dpo/beta": 0.08496525138616562, "epsilon_dpo/beta_margin_grad_mean": -0.43119847774505615, "epsilon_dpo/beta_margin_grad_std": 0.058498919010162354, "epsilon_dpo/beta_margin_mean": 0.2807844281196594, "epsilon_dpo/beta_margin_std": 0.24786221981048584, "epsilon_dpo/loss_margin_mean": 3.309786796569824, "grad_norm": 127.04460906982422, "kl/avg_steps": 0.9375, "kl/beta": 0.08576075732707977, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.318840579710145e-07, "logits/chosen": -3.5147736072540283, "logits/rejected": -3.4672505855560303, "logps/chosen": -40.38764953613281, "logps/ref_chosen": -40.39752960205078, "logps/ref_rejected": -94.48348999023438, "logps/rejected": -97.78340148925781, "loss": 1.1399, "rewards/accuracies": 0.96875, "rewards/chosen": 0.00047481246292591095, "rewards/margins": 0.28078436851501465, "rewards/rejected": -0.2803095579147339, "step": 33 }, { "epoch": 0.049926578560939794, "epsilon_dpo/beta": 0.08430886268615723, "epsilon_dpo/beta_margin_grad_mean": -0.42384782433509827, "epsilon_dpo/beta_margin_grad_std": 0.06483103334903717, "epsilon_dpo/beta_margin_mean": 0.3134419322013855, "epsilon_dpo/beta_margin_std": 0.27257490158081055, "epsilon_dpo/loss_margin_mean": 3.726898193359375, "grad_norm": 127.52790069580078, "kl/avg_steps": 0.78125, "kl/beta": 0.08496421575546265, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.391304347826087e-07, "logits/chosen": -3.4469704627990723, "logits/rejected": -3.5294151306152344, "logps/chosen": -35.16386413574219, "logps/ref_chosen": -35.10262680053711, "logps/ref_rejected": -106.70514678955078, "logps/rejected": -110.49327850341797, "loss": 1.1152, "rewards/accuracies": 0.921875, "rewards/chosen": -0.0053707570768892765, "rewards/margins": 0.3134418725967407, "rewards/rejected": -0.31881266832351685, "step": 34 }, { "epoch": 0.0513950073421439, "epsilon_dpo/beta": 0.0835762619972229, "epsilon_dpo/beta_margin_grad_mean": -0.4079773724079132, "epsilon_dpo/beta_margin_grad_std": 0.07277688384056091, "epsilon_dpo/beta_margin_mean": 0.3832019567489624, "epsilon_dpo/beta_margin_std": 0.3155387043952942, "epsilon_dpo/loss_margin_mean": 4.590485095977783, "grad_norm": 120.23053741455078, "kl/avg_steps": 0.875, "kl/beta": 0.08430557698011398, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.463768115942029e-07, "logits/chosen": -3.4881181716918945, "logits/rejected": -3.5153722763061523, "logps/chosen": -34.6436767578125, "logps/ref_chosen": -34.41180419921875, "logps/ref_rejected": -111.88399505615234, "logps/rejected": -116.70634460449219, "loss": 1.0629, "rewards/accuracies": 0.953125, "rewards/chosen": -0.019653314724564552, "rewards/margins": 0.3832019567489624, "rewards/rejected": -0.4028552770614624, "step": 35 }, { "epoch": 0.05286343612334802, "epsilon_dpo/beta": 0.08292967081069946, "epsilon_dpo/beta_margin_grad_mean": -0.3959466516971588, "epsilon_dpo/beta_margin_grad_std": 0.09478563815355301, "epsilon_dpo/beta_margin_mean": 0.4413573443889618, "epsilon_dpo/beta_margin_std": 0.41608670353889465, "epsilon_dpo/loss_margin_mean": 5.337898254394531, "grad_norm": 105.79562377929688, "kl/avg_steps": 0.78125, "kl/beta": 0.08357430249452591, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.536231884057971e-07, "logits/chosen": -3.4250617027282715, "logits/rejected": -3.506716251373291, "logps/chosen": -32.58708572387695, "logps/ref_chosen": -32.743473052978516, "logps/ref_rejected": -90.1633529663086, "logps/rejected": -95.34486389160156, "loss": 1.0334, "rewards/accuracies": 0.921875, "rewards/chosen": 0.012363580986857414, "rewards/margins": 0.4413573741912842, "rewards/rejected": -0.428993821144104, "step": 36 }, { "epoch": 0.05433186490455213, "epsilon_dpo/beta": 0.08228681236505508, "epsilon_dpo/beta_margin_grad_mean": -0.3971908390522003, "epsilon_dpo/beta_margin_grad_std": 0.09092327207326889, "epsilon_dpo/beta_margin_mean": 0.4376983940601349, "epsilon_dpo/beta_margin_std": 0.4152112901210785, "epsilon_dpo/loss_margin_mean": 5.334095478057861, "grad_norm": 100.5900650024414, "kl/avg_steps": 0.78125, "kl/beta": 0.082926444709301, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -3.4775028228759766, "logits/rejected": -3.44918155670166, "logps/chosen": -40.18016052246094, "logps/ref_chosen": -39.88025665283203, "logps/ref_rejected": -83.95890808105469, "logps/rejected": -89.59291076660156, "loss": 1.0351, "rewards/accuracies": 0.921875, "rewards/chosen": -0.025536730885505676, "rewards/margins": 0.4376984238624573, "rewards/rejected": -0.46323513984680176, "step": 37 }, { "epoch": 0.055800293685756244, "epsilon_dpo/beta": 0.08167463541030884, "epsilon_dpo/beta_margin_grad_mean": -0.36130231618881226, "epsilon_dpo/beta_margin_grad_std": 0.11581370234489441, "epsilon_dpo/beta_margin_mean": 0.6206669211387634, "epsilon_dpo/beta_margin_std": 0.5732053518295288, "epsilon_dpo/loss_margin_mean": 7.621293067932129, "grad_norm": 99.69669342041016, "kl/avg_steps": 0.75, "kl/beta": 0.08228360116481781, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.681159420289855e-07, "logits/chosen": -3.436084270477295, "logits/rejected": -3.5204224586486816, "logps/chosen": -33.92012405395508, "logps/ref_chosen": -33.85154342651367, "logps/ref_rejected": -104.96053314208984, "logps/rejected": -112.65040588378906, "loss": 0.9294, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0062013790011405945, "rewards/margins": 0.6206669807434082, "rewards/rejected": -0.6268683671951294, "step": 38 }, { "epoch": 0.05726872246696035, "epsilon_dpo/beta": 0.0809134915471077, "epsilon_dpo/beta_margin_grad_mean": -0.35245731472969055, "epsilon_dpo/beta_margin_grad_std": 0.09772588312625885, "epsilon_dpo/beta_margin_mean": 0.6451180577278137, "epsilon_dpo/beta_margin_std": 0.48367542028427124, "epsilon_dpo/loss_margin_mean": 7.98002290725708, "grad_norm": 102.14472198486328, "kl/avg_steps": 0.9375, "kl/beta": 0.08167106658220291, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.753623188405797e-07, "logits/chosen": -3.435272216796875, "logits/rejected": -3.441884994506836, "logps/chosen": -31.436540603637695, "logps/ref_chosen": -31.883747100830078, "logps/ref_rejected": -84.24908447265625, "logps/rejected": -91.78189086914062, "loss": 0.8922, "rewards/accuracies": 0.96875, "rewards/chosen": 0.035918496549129486, "rewards/margins": 0.645117998123169, "rewards/rejected": -0.6091995239257812, "step": 39 }, { "epoch": 0.05873715124816446, "epsilon_dpo/beta": 0.08031369745731354, "epsilon_dpo/beta_margin_grad_mean": -0.35732436180114746, "epsilon_dpo/beta_margin_grad_std": 0.10807473212480545, "epsilon_dpo/beta_margin_mean": 0.6359905004501343, "epsilon_dpo/beta_margin_std": 0.5488929152488708, "epsilon_dpo/loss_margin_mean": 7.938919544219971, "grad_norm": 94.2869644165039, "kl/avg_steps": 0.75, "kl/beta": 0.0809125155210495, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -3.48494291305542, "logits/rejected": -3.507589101791382, "logps/chosen": -36.815399169921875, "logps/ref_chosen": -36.59412384033203, "logps/ref_rejected": -80.92609405517578, "logps/rejected": -89.08628845214844, "loss": 0.9118, "rewards/accuracies": 0.953125, "rewards/chosen": -0.018150903284549713, "rewards/margins": 0.6359904408454895, "rewards/rejected": -0.654141366481781, "step": 40 }, { "epoch": 0.06020558002936858, "epsilon_dpo/beta": 0.07975336164236069, "epsilon_dpo/beta_margin_grad_mean": -0.3376636803150177, "epsilon_dpo/beta_margin_grad_std": 0.1441744863986969, "epsilon_dpo/beta_margin_mean": 0.762270450592041, "epsilon_dpo/beta_margin_std": 0.7963690161705017, "epsilon_dpo/loss_margin_mean": 9.593799591064453, "grad_norm": 101.93110656738281, "kl/avg_steps": 0.703125, "kl/beta": 0.08031018823385239, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.898550724637681e-07, "logits/chosen": -3.483686923980713, "logits/rejected": -3.4968438148498535, "logps/chosen": -41.283260345458984, "logps/ref_chosen": -39.986053466796875, "logps/ref_rejected": -105.53334045410156, "logps/rejected": -116.42434692382812, "loss": 0.889, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10497994720935822, "rewards/margins": 0.762270450592041, "rewards/rejected": -0.8672504425048828, "step": 41 }, { "epoch": 0.06167400881057269, "epsilon_dpo/beta": 0.07915924489498138, "epsilon_dpo/beta_margin_grad_mean": -0.3248225450515747, "epsilon_dpo/beta_margin_grad_std": 0.14031846821308136, "epsilon_dpo/beta_margin_mean": 0.8673705458641052, "epsilon_dpo/beta_margin_std": 0.8684898018836975, "epsilon_dpo/loss_margin_mean": 10.988822937011719, "grad_norm": 98.95552062988281, "kl/avg_steps": 0.75, "kl/beta": 0.0797494500875473, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.971014492753623e-07, "logits/chosen": -3.436079502105713, "logits/rejected": -3.551501989364624, "logps/chosen": -47.57762145996094, "logps/ref_chosen": -45.769351959228516, "logps/ref_rejected": -118.03570556640625, "logps/rejected": -130.83279418945312, "loss": 0.8297, "rewards/accuracies": 0.921875, "rewards/chosen": -0.1441619098186493, "rewards/margins": 0.86737060546875, "rewards/rejected": -1.0115324258804321, "step": 42 }, { "epoch": 0.0631424375917768, "epsilon_dpo/beta": 0.07852049171924591, "epsilon_dpo/beta_margin_grad_mean": -0.29879364371299744, "epsilon_dpo/beta_margin_grad_std": 0.14372758567333221, "epsilon_dpo/beta_margin_mean": 0.9733436703681946, "epsilon_dpo/beta_margin_std": 0.7786263227462769, "epsilon_dpo/loss_margin_mean": 12.422151565551758, "grad_norm": 86.0045166015625, "kl/avg_steps": 0.8125, "kl/beta": 0.07915578037500381, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.043478260869565e-07, "logits/chosen": -3.4531750679016113, "logits/rejected": -3.502927303314209, "logps/chosen": -36.81052017211914, "logps/ref_chosen": -36.684478759765625, "logps/ref_rejected": -104.91730499267578, "logps/rejected": -117.46549224853516, "loss": 0.7552, "rewards/accuracies": 0.953125, "rewards/chosen": -0.010491969995200634, "rewards/margins": 0.9733436703681946, "rewards/rejected": -0.9838355779647827, "step": 43 }, { "epoch": 0.06461086637298091, "epsilon_dpo/beta": 0.07783857733011246, "epsilon_dpo/beta_margin_grad_mean": -0.2732856273651123, "epsilon_dpo/beta_margin_grad_std": 0.15621285140514374, "epsilon_dpo/beta_margin_mean": 1.1567952632904053, "epsilon_dpo/beta_margin_std": 0.9610118269920349, "epsilon_dpo/loss_margin_mean": 14.889684677124023, "grad_norm": 85.54306030273438, "kl/avg_steps": 0.875, "kl/beta": 0.07851782441139221, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.115942028985507e-07, "logits/chosen": -3.3832082748413086, "logits/rejected": -3.4922008514404297, "logps/chosen": -28.09503936767578, "logps/ref_chosen": -27.785930633544922, "logps/ref_rejected": -117.58551788330078, "logps/rejected": -132.7843017578125, "loss": 0.6985, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02512773871421814, "rewards/margins": 1.1567951440811157, "rewards/rejected": -1.1819229125976562, "step": 44 }, { "epoch": 0.06607929515418502, "epsilon_dpo/beta": 0.07733368128538132, "epsilon_dpo/beta_margin_grad_mean": -0.3332192301750183, "epsilon_dpo/beta_margin_grad_std": 0.15007154643535614, "epsilon_dpo/beta_margin_mean": 0.8284286260604858, "epsilon_dpo/beta_margin_std": 0.8817453980445862, "epsilon_dpo/loss_margin_mean": 10.75483226776123, "grad_norm": 86.84064483642578, "kl/avg_steps": 0.65625, "kl/beta": 0.07783675193786621, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.188405797101449e-07, "logits/chosen": -3.4090023040771484, "logits/rejected": -3.488529920578003, "logps/chosen": -38.12892150878906, "logps/ref_chosen": -36.33074951171875, "logps/ref_rejected": -83.34062194824219, "logps/rejected": -95.89362335205078, "loss": 0.8642, "rewards/accuracies": 0.875, "rewards/chosen": -0.14034625887870789, "rewards/margins": 0.8284286260604858, "rewards/rejected": -0.9687749147415161, "step": 45 }, { "epoch": 0.06754772393538913, "epsilon_dpo/beta": 0.07675698399543762, "epsilon_dpo/beta_margin_grad_mean": -0.2891290783882141, "epsilon_dpo/beta_margin_grad_std": 0.1580805480480194, "epsilon_dpo/beta_margin_mean": 1.1003601551055908, "epsilon_dpo/beta_margin_std": 1.0318849086761475, "epsilon_dpo/loss_margin_mean": 14.38028621673584, "grad_norm": 96.92198181152344, "kl/avg_steps": 0.75, "kl/beta": 0.07732927799224854, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.260869565217391e-07, "logits/chosen": -3.375389814376831, "logits/rejected": -3.439635992050171, "logps/chosen": -37.75640106201172, "logps/ref_chosen": -36.0171012878418, "logps/ref_rejected": -101.40194702148438, "logps/rejected": -117.52153015136719, "loss": 0.7386, "rewards/accuracies": 0.90625, "rewards/chosen": -0.13517028093338013, "rewards/margins": 1.1003601551055908, "rewards/rejected": -1.2355304956436157, "step": 46 }, { "epoch": 0.06901615271659324, "epsilon_dpo/beta": 0.07616160064935684, "epsilon_dpo/beta_margin_grad_mean": -0.31451866030693054, "epsilon_dpo/beta_margin_grad_std": 0.16808247566223145, "epsilon_dpo/beta_margin_mean": 0.9304282069206238, "epsilon_dpo/beta_margin_std": 1.0573234558105469, "epsilon_dpo/loss_margin_mean": 12.260807037353516, "grad_norm": 94.45354461669922, "kl/avg_steps": 0.78125, "kl/beta": 0.07675362378358841, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.333333333333333e-07, "logits/chosen": -3.4388890266418457, "logits/rejected": -3.5206193923950195, "logps/chosen": -44.316593170166016, "logps/ref_chosen": -41.82904815673828, "logps/ref_rejected": -101.29283142089844, "logps/rejected": -116.04118347167969, "loss": 0.8604, "rewards/accuracies": 0.890625, "rewards/chosen": -0.19145357608795166, "rewards/margins": 0.9304282665252686, "rewards/rejected": -1.1218818426132202, "step": 47 }, { "epoch": 0.07048458149779736, "epsilon_dpo/beta": 0.07571401447057724, "epsilon_dpo/beta_margin_grad_mean": -0.29027602076530457, "epsilon_dpo/beta_margin_grad_std": 0.17632031440734863, "epsilon_dpo/beta_margin_mean": 1.1656516790390015, "epsilon_dpo/beta_margin_std": 1.2263123989105225, "epsilon_dpo/loss_margin_mean": 15.469880104064941, "grad_norm": 90.87052154541016, "kl/avg_steps": 0.59375, "kl/beta": 0.07615863531827927, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -3.463271141052246, "logits/rejected": -3.4467902183532715, "logps/chosen": -38.7716064453125, "logps/ref_chosen": -36.18339920043945, "logps/ref_rejected": -95.41502380371094, "logps/rejected": -113.47309875488281, "loss": 0.7607, "rewards/accuracies": 0.859375, "rewards/chosen": -0.19789756834506989, "rewards/margins": 1.1656516790390015, "rewards/rejected": -1.3635492324829102, "step": 48 }, { "epoch": 0.07195301027900147, "epsilon_dpo/beta": 0.075148805975914, "epsilon_dpo/beta_margin_grad_mean": -0.2587027847766876, "epsilon_dpo/beta_margin_grad_std": 0.17777220904827118, "epsilon_dpo/beta_margin_mean": 1.3605566024780273, "epsilon_dpo/beta_margin_std": 1.2057913541793823, "epsilon_dpo/loss_margin_mean": 18.15953826904297, "grad_norm": 81.80361938476562, "kl/avg_steps": 0.75, "kl/beta": 0.07570911198854446, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.478260869565217e-07, "logits/chosen": -3.472126007080078, "logits/rejected": -3.5396313667297363, "logps/chosen": -37.51390075683594, "logps/ref_chosen": -35.96546936035156, "logps/ref_rejected": -91.31820678710938, "logps/rejected": -111.02618408203125, "loss": 0.6703, "rewards/accuracies": 0.90625, "rewards/chosen": -0.118332639336586, "rewards/margins": 1.3605566024780273, "rewards/rejected": -1.4788892269134521, "step": 49 }, { "epoch": 0.07342143906020558, "epsilon_dpo/beta": 0.07468333095312119, "epsilon_dpo/beta_margin_grad_mean": -0.27890822291374207, "epsilon_dpo/beta_margin_grad_std": 0.21263383328914642, "epsilon_dpo/beta_margin_mean": 1.3234366178512573, "epsilon_dpo/beta_margin_std": 1.4268280267715454, "epsilon_dpo/loss_margin_mean": 17.81157684326172, "grad_norm": 100.63481903076172, "kl/avg_steps": 0.625, "kl/beta": 0.07514552026987076, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -3.4349403381347656, "logits/rejected": -3.533860445022583, "logps/chosen": -46.16408920288086, "logps/ref_chosen": -42.138206481933594, "logps/ref_rejected": -100.4173583984375, "logps/rejected": -122.25482177734375, "loss": 0.7726, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30415183305740356, "rewards/margins": 1.3234367370605469, "rewards/rejected": -1.6275885105133057, "step": 50 }, { "epoch": 0.07488986784140969, "epsilon_dpo/beta": 0.07421945780515671, "epsilon_dpo/beta_margin_grad_mean": -0.28537219762802124, "epsilon_dpo/beta_margin_grad_std": 0.19426609575748444, "epsilon_dpo/beta_margin_mean": 1.3490850925445557, "epsilon_dpo/beta_margin_std": 1.5637013912200928, "epsilon_dpo/loss_margin_mean": 18.262516021728516, "grad_norm": 91.88065338134766, "kl/avg_steps": 0.625, "kl/beta": 0.07467877864837646, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -3.377105236053467, "logits/rejected": -3.4192728996276855, "logps/chosen": -43.83557891845703, "logps/ref_chosen": -39.016597747802734, "logps/ref_rejected": -80.60652160644531, "logps/rejected": -103.68801879882812, "loss": 0.7605, "rewards/accuracies": 0.875, "rewards/chosen": -0.3597980737686157, "rewards/margins": 1.3490850925445557, "rewards/rejected": -1.7088831663131714, "step": 51 }, { "epoch": 0.0763582966226138, "epsilon_dpo/beta": 0.07364249974489212, "epsilon_dpo/beta_margin_grad_mean": -0.23005272448062897, "epsilon_dpo/beta_margin_grad_std": 0.1819276660680771, "epsilon_dpo/beta_margin_mean": 1.7073160409927368, "epsilon_dpo/beta_margin_std": 1.5912011861801147, "epsilon_dpo/loss_margin_mean": 23.246597290039062, "grad_norm": 87.76582336425781, "kl/avg_steps": 0.78125, "kl/beta": 0.07421493530273438, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.695652173913043e-07, "logits/chosen": -3.4000444412231445, "logits/rejected": -3.473278284072876, "logps/chosen": -39.097511291503906, "logps/ref_chosen": -34.285945892333984, "logps/ref_rejected": -85.96109008789062, "logps/rejected": -114.01925659179688, "loss": 0.6001, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3565700650215149, "rewards/margins": 1.7073161602020264, "rewards/rejected": -2.0638861656188965, "step": 52 }, { "epoch": 0.07782672540381791, "epsilon_dpo/beta": 0.07307162135839462, "epsilon_dpo/beta_margin_grad_mean": -0.21757791936397552, "epsilon_dpo/beta_margin_grad_std": 0.17687389254570007, "epsilon_dpo/beta_margin_mean": 1.8149864673614502, "epsilon_dpo/beta_margin_std": 1.6574300527572632, "epsilon_dpo/loss_margin_mean": 24.905494689941406, "grad_norm": 85.12645721435547, "kl/avg_steps": 0.78125, "kl/beta": 0.07363962382078171, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -3.3786673545837402, "logits/rejected": -3.4282350540161133, "logps/chosen": -42.113983154296875, "logps/ref_chosen": -34.706817626953125, "logps/ref_rejected": -97.64952087402344, "logps/rejected": -129.96218872070312, "loss": 0.558, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5423527359962463, "rewards/margins": 1.8149864673614502, "rewards/rejected": -2.3573391437530518, "step": 53 }, { "epoch": 0.07929515418502203, "epsilon_dpo/beta": 0.07252801209688187, "epsilon_dpo/beta_margin_grad_mean": -0.23283910751342773, "epsilon_dpo/beta_margin_grad_std": 0.18114687502384186, "epsilon_dpo/beta_margin_mean": 1.7452175617218018, "epsilon_dpo/beta_margin_std": 1.7455625534057617, "epsilon_dpo/loss_margin_mean": 24.13280487060547, "grad_norm": 83.51258850097656, "kl/avg_steps": 0.75, "kl/beta": 0.0730687752366066, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -3.400343656539917, "logits/rejected": -3.5012292861938477, "logps/chosen": -46.09777069091797, "logps/ref_chosen": -39.777854919433594, "logps/ref_rejected": -98.70614624023438, "logps/rejected": -129.1588592529297, "loss": 0.6017, "rewards/accuracies": 0.90625, "rewards/chosen": -0.46132320165634155, "rewards/margins": 1.7452175617218018, "rewards/rejected": -2.206540822982788, "step": 54 }, { "epoch": 0.08076358296622614, "epsilon_dpo/beta": 0.07214676588773727, "epsilon_dpo/beta_margin_grad_mean": -0.2661394476890564, "epsilon_dpo/beta_margin_grad_std": 0.21455174684524536, "epsilon_dpo/beta_margin_mean": 1.643436074256897, "epsilon_dpo/beta_margin_std": 1.8868768215179443, "epsilon_dpo/loss_margin_mean": 22.906444549560547, "grad_norm": 104.0904541015625, "kl/avg_steps": 0.53125, "kl/beta": 0.07252483814954758, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -3.390665054321289, "logits/rejected": -3.4186453819274902, "logps/chosen": -58.18317794799805, "logps/ref_chosen": -48.42914962768555, "logps/ref_rejected": -93.72831726074219, "logps/rejected": -126.38877868652344, "loss": 0.7261, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7076321840286255, "rewards/margins": 1.643436074256897, "rewards/rejected": -2.3510682582855225, "step": 55 }, { "epoch": 0.08223201174743025, "epsilon_dpo/beta": 0.07168648391962051, "epsilon_dpo/beta_margin_grad_mean": -0.2267267107963562, "epsilon_dpo/beta_margin_grad_std": 0.24484246969223022, "epsilon_dpo/beta_margin_mean": 1.977746605873108, "epsilon_dpo/beta_margin_std": 2.0250532627105713, "epsilon_dpo/loss_margin_mean": 27.73088836669922, "grad_norm": 123.8470458984375, "kl/avg_steps": 0.640625, "kl/beta": 0.07214158773422241, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -3.2844839096069336, "logits/rejected": -3.3951363563537598, "logps/chosen": -38.82612991333008, "logps/ref_chosen": -31.692344665527344, "logps/ref_rejected": -100.61968994140625, "logps/rejected": -135.484375, "loss": 0.713, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5154626369476318, "rewards/margins": 1.9777467250823975, "rewards/rejected": -2.4932093620300293, "step": 56 }, { "epoch": 0.08370044052863436, "epsilon_dpo/beta": 0.07115186750888824, "epsilon_dpo/beta_margin_grad_mean": -0.21097464859485626, "epsilon_dpo/beta_margin_grad_std": 0.22179222106933594, "epsilon_dpo/beta_margin_mean": 2.176724910736084, "epsilon_dpo/beta_margin_std": 2.1261887550354004, "epsilon_dpo/loss_margin_mean": 30.698436737060547, "grad_norm": 102.96115112304688, "kl/avg_steps": 0.75, "kl/beta": 0.07168237119913101, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.057971014492754e-07, "logits/chosen": -3.386384963989258, "logits/rejected": -3.439547538757324, "logps/chosen": -45.702720642089844, "logps/ref_chosen": -38.302345275878906, "logps/ref_rejected": -101.74482727050781, "logps/rejected": -139.84365844726562, "loss": 0.6021, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5308903455734253, "rewards/margins": 2.176724910736084, "rewards/rejected": -2.707615375518799, "step": 57 }, { "epoch": 0.08516886930983847, "epsilon_dpo/beta": 0.07062220573425293, "epsilon_dpo/beta_margin_grad_mean": -0.1960882991552353, "epsilon_dpo/beta_margin_grad_std": 0.17183293402194977, "epsilon_dpo/beta_margin_mean": 2.15674090385437, "epsilon_dpo/beta_margin_std": 2.0642378330230713, "epsilon_dpo/loss_margin_mean": 30.6201114654541, "grad_norm": 87.39006805419922, "kl/avg_steps": 0.75, "kl/beta": 0.07114876061677933, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -3.3047280311584473, "logits/rejected": -3.393859386444092, "logps/chosen": -46.135406494140625, "logps/ref_chosen": -38.44845962524414, "logps/ref_rejected": -89.55912780761719, "logps/rejected": -127.86618041992188, "loss": 0.4979, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5473577976226807, "rewards/margins": 2.15674090385437, "rewards/rejected": -2.704098701477051, "step": 58 }, { "epoch": 0.08663729809104258, "epsilon_dpo/beta": 0.0700632631778717, "epsilon_dpo/beta_margin_grad_mean": -0.21717199683189392, "epsilon_dpo/beta_margin_grad_std": 0.2148067206144333, "epsilon_dpo/beta_margin_mean": 2.177267074584961, "epsilon_dpo/beta_margin_std": 2.287863254547119, "epsilon_dpo/loss_margin_mean": 31.16427993774414, "grad_norm": 87.90422058105469, "kl/avg_steps": 0.796875, "kl/beta": 0.0706191137433052, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -3.3299951553344727, "logits/rejected": -3.408975124359131, "logps/chosen": -46.44553756713867, "logps/ref_chosen": -38.029998779296875, "logps/ref_rejected": -94.10072326660156, "logps/rejected": -133.6805419921875, "loss": 0.6172, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5922181010246277, "rewards/margins": 2.177267074584961, "rewards/rejected": -2.7694854736328125, "step": 59 }, { "epoch": 0.0881057268722467, "epsilon_dpo/beta": 0.06962990015745163, "epsilon_dpo/beta_margin_grad_mean": -0.24627114832401276, "epsilon_dpo/beta_margin_grad_std": 0.2656000852584839, "epsilon_dpo/beta_margin_mean": 1.7643014192581177, "epsilon_dpo/beta_margin_std": 2.2814714908599854, "epsilon_dpo/loss_margin_mean": 25.50347328186035, "grad_norm": 167.74856567382812, "kl/avg_steps": 0.625, "kl/beta": 0.07006081938743591, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -3.3077573776245117, "logits/rejected": -3.2960987091064453, "logps/chosen": -62.78964614868164, "logps/ref_chosen": -48.789947509765625, "logps/ref_rejected": -91.3543701171875, "logps/rejected": -130.8575439453125, "loss": 0.9335, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9826135635375977, "rewards/margins": 1.7643013000488281, "rewards/rejected": -2.746914863586426, "step": 60 }, { "epoch": 0.08957415565345081, "epsilon_dpo/beta": 0.06911037117242813, "epsilon_dpo/beta_margin_grad_mean": -0.20471161603927612, "epsilon_dpo/beta_margin_grad_std": 0.22111034393310547, "epsilon_dpo/beta_margin_mean": 2.207209825515747, "epsilon_dpo/beta_margin_std": 2.3618977069854736, "epsilon_dpo/loss_margin_mean": 32.056705474853516, "grad_norm": 96.6867446899414, "kl/avg_steps": 0.75, "kl/beta": 0.06962565332651138, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -3.313138484954834, "logits/rejected": -3.4350461959838867, "logps/chosen": -47.05156707763672, "logps/ref_chosen": -35.972103118896484, "logps/ref_rejected": -95.45098876953125, "logps/rejected": -138.587158203125, "loss": 0.6385, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7702499628067017, "rewards/margins": 2.207209825515747, "rewards/rejected": -2.9774599075317383, "step": 61 }, { "epoch": 0.09104258443465492, "epsilon_dpo/beta": 0.0686391070485115, "epsilon_dpo/beta_margin_grad_mean": -0.2324148714542389, "epsilon_dpo/beta_margin_grad_std": 0.20171168446540833, "epsilon_dpo/beta_margin_mean": 1.8510175943374634, "epsilon_dpo/beta_margin_std": 1.8581922054290771, "epsilon_dpo/loss_margin_mean": 27.06845474243164, "grad_norm": 91.73394775390625, "kl/avg_steps": 0.6875, "kl/beta": 0.06910735368728638, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.420289855072464e-07, "logits/chosen": -3.2718987464904785, "logits/rejected": -3.3174455165863037, "logps/chosen": -45.381500244140625, "logps/ref_chosen": -35.904327392578125, "logps/ref_rejected": -82.9093017578125, "logps/rejected": -119.45492553710938, "loss": 0.6249, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6530295610427856, "rewards/margins": 1.8510175943374634, "rewards/rejected": -2.504047155380249, "step": 62 }, { "epoch": 0.09251101321585903, "epsilon_dpo/beta": 0.06817042827606201, "epsilon_dpo/beta_margin_grad_mean": -0.22793808579444885, "epsilon_dpo/beta_margin_grad_std": 0.2142634242773056, "epsilon_dpo/beta_margin_mean": 2.1623551845550537, "epsilon_dpo/beta_margin_std": 2.539600372314453, "epsilon_dpo/loss_margin_mean": 31.848644256591797, "grad_norm": 102.05279541015625, "kl/avg_steps": 0.6875, "kl/beta": 0.0686354786157608, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -3.3514161109924316, "logits/rejected": -3.4100210666656494, "logps/chosen": -60.41229248046875, "logps/ref_chosen": -46.25957107543945, "logps/ref_rejected": -104.15571594238281, "logps/rejected": -150.15708923339844, "loss": 0.6897, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9690430164337158, "rewards/margins": 2.1623551845550537, "rewards/rejected": -3.1313982009887695, "step": 63 }, { "epoch": 0.09397944199706314, "epsilon_dpo/beta": 0.06770496070384979, "epsilon_dpo/beta_margin_grad_mean": -0.19847899675369263, "epsilon_dpo/beta_margin_grad_std": 0.19424547255039215, "epsilon_dpo/beta_margin_mean": 2.3565456867218018, "epsilon_dpo/beta_margin_std": 2.4017789363861084, "epsilon_dpo/loss_margin_mean": 34.92362594604492, "grad_norm": 95.53367614746094, "kl/avg_steps": 0.6875, "kl/beta": 0.0681668370962143, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -3.3310346603393555, "logits/rejected": -3.3657891750335693, "logps/chosen": -44.803466796875, "logps/ref_chosen": -34.512210845947266, "logps/ref_rejected": -101.21166229248047, "logps/rejected": -146.42654418945312, "loss": 0.5255, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6991441249847412, "rewards/margins": 2.3565456867218018, "rewards/rejected": -3.055689811706543, "step": 64 }, { "epoch": 0.09544787077826726, "epsilon_dpo/beta": 0.06730613857507706, "epsilon_dpo/beta_margin_grad_mean": -0.21261604130268097, "epsilon_dpo/beta_margin_grad_std": 0.22306111454963684, "epsilon_dpo/beta_margin_mean": 2.48264741897583, "epsilon_dpo/beta_margin_std": 2.5546720027923584, "epsilon_dpo/loss_margin_mean": 37.058799743652344, "grad_norm": 93.06818389892578, "kl/avg_steps": 0.59375, "kl/beta": 0.06770138442516327, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.63768115942029e-07, "logits/chosen": -3.3164443969726562, "logits/rejected": -3.343629837036133, "logps/chosen": -53.97098159790039, "logps/ref_chosen": -43.620361328125, "logps/ref_rejected": -115.21531677246094, "logps/rejected": -162.62474060058594, "loss": 0.5986, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6990154981613159, "rewards/margins": 2.48264741897583, "rewards/rejected": -3.1816627979278564, "step": 65 }, { "epoch": 0.09691629955947137, "epsilon_dpo/beta": 0.0669299066066742, "epsilon_dpo/beta_margin_grad_mean": -0.24468040466308594, "epsilon_dpo/beta_margin_grad_std": 0.2503945231437683, "epsilon_dpo/beta_margin_mean": 2.0747146606445312, "epsilon_dpo/beta_margin_std": 2.3843464851379395, "epsilon_dpo/loss_margin_mean": 31.18071746826172, "grad_norm": 116.8039321899414, "kl/avg_steps": 0.5625, "kl/beta": 0.06730178743600845, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -3.2685537338256836, "logits/rejected": -3.3573737144470215, "logps/chosen": -49.09389877319336, "logps/ref_chosen": -37.514625549316406, "logps/ref_rejected": -80.34272766113281, "logps/rejected": -123.10272216796875, "loss": 0.7582, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7809337973594666, "rewards/margins": 2.0747146606445312, "rewards/rejected": -2.8556485176086426, "step": 66 }, { "epoch": 0.09838472834067548, "epsilon_dpo/beta": 0.0665346160531044, "epsilon_dpo/beta_margin_grad_mean": -0.2480592578649521, "epsilon_dpo/beta_margin_grad_std": 0.23476538062095642, "epsilon_dpo/beta_margin_mean": 1.869264006614685, "epsilon_dpo/beta_margin_std": 2.120335340499878, "epsilon_dpo/loss_margin_mean": 28.2425479888916, "grad_norm": 115.71697998046875, "kl/avg_steps": 0.59375, "kl/beta": 0.06692533195018768, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.782608695652174e-07, "logits/chosen": -3.2035374641418457, "logits/rejected": -3.216823101043701, "logps/chosen": -51.816200256347656, "logps/ref_chosen": -38.82200622558594, "logps/ref_rejected": -78.41658782958984, "logps/rejected": -119.65333557128906, "loss": 0.7213, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8721305727958679, "rewards/margins": 1.8692641258239746, "rewards/rejected": -2.741394519805908, "step": 67 }, { "epoch": 0.09985315712187959, "epsilon_dpo/beta": 0.06605872511863708, "epsilon_dpo/beta_margin_grad_mean": -0.19077911972999573, "epsilon_dpo/beta_margin_grad_std": 0.20632153749465942, "epsilon_dpo/beta_margin_mean": 2.237042188644409, "epsilon_dpo/beta_margin_std": 1.8940068483352661, "epsilon_dpo/loss_margin_mean": 33.978355407714844, "grad_norm": 89.41032409667969, "kl/avg_steps": 0.71875, "kl/beta": 0.06653030216693878, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.855072463768116e-07, "logits/chosen": -3.2750229835510254, "logits/rejected": -3.2742607593536377, "logps/chosen": -53.25446319580078, "logps/ref_chosen": -41.910316467285156, "logps/ref_rejected": -82.59764862060547, "logps/rejected": -127.92015075683594, "loss": 0.5268, "rewards/accuracies": 0.875, "rewards/chosen": -0.7523297071456909, "rewards/margins": 2.237042188644409, "rewards/rejected": -2.9893717765808105, "step": 68 }, { "epoch": 0.1013215859030837, "epsilon_dpo/beta": 0.06550473719835281, "epsilon_dpo/beta_margin_grad_mean": -0.1768295019865036, "epsilon_dpo/beta_margin_grad_std": 0.19139200448989868, "epsilon_dpo/beta_margin_mean": 2.4585933685302734, "epsilon_dpo/beta_margin_std": 2.252986431121826, "epsilon_dpo/loss_margin_mean": 37.620914459228516, "grad_norm": 85.71179962158203, "kl/avg_steps": 0.84375, "kl/beta": 0.066055528819561, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.927536231884058e-07, "logits/chosen": -3.309999704360962, "logits/rejected": -3.2920103073120117, "logps/chosen": -56.83376693725586, "logps/ref_chosen": -42.98963165283203, "logps/ref_rejected": -111.28137969970703, "logps/rejected": -162.74642944335938, "loss": 0.5513, "rewards/accuracies": 0.921875, "rewards/chosen": -0.910045325756073, "rewards/margins": 2.4585933685302734, "rewards/rejected": -3.3686389923095703, "step": 69 }, { "epoch": 0.1027900146842878, "epsilon_dpo/beta": 0.06497713923454285, "epsilon_dpo/beta_margin_grad_mean": -0.1917823851108551, "epsilon_dpo/beta_margin_grad_std": 0.18596571683883667, "epsilon_dpo/beta_margin_mean": 2.3181331157684326, "epsilon_dpo/beta_margin_std": 2.1517505645751953, "epsilon_dpo/loss_margin_mean": 35.75630187988281, "grad_norm": 88.73102569580078, "kl/avg_steps": 0.8125, "kl/beta": 0.06550285220146179, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 5e-07, "logits/chosen": -3.2341670989990234, "logits/rejected": -3.3183038234710693, "logps/chosen": -55.655487060546875, "logps/ref_chosen": -43.68109130859375, "logps/ref_rejected": -97.17718505859375, "logps/rejected": -144.90789794921875, "loss": 0.5016, "rewards/accuracies": 0.90625, "rewards/chosen": -0.781528651714325, "rewards/margins": 2.3181333541870117, "rewards/rejected": -3.0996618270874023, "step": 70 }, { "epoch": 0.10425844346549193, "epsilon_dpo/beta": 0.0644940659403801, "epsilon_dpo/beta_margin_grad_mean": -0.18719229102134705, "epsilon_dpo/beta_margin_grad_std": 0.23163102567195892, "epsilon_dpo/beta_margin_mean": 2.6485307216644287, "epsilon_dpo/beta_margin_std": 2.504869222640991, "epsilon_dpo/loss_margin_mean": 41.206172943115234, "grad_norm": 100.56282806396484, "kl/avg_steps": 0.75, "kl/beta": 0.06497492641210556, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.999967061337492e-07, "logits/chosen": -3.228590488433838, "logits/rejected": -3.3432466983795166, "logps/chosen": -54.0555419921875, "logps/ref_chosen": -40.898582458496094, "logps/ref_rejected": -104.50498962402344, "logps/rejected": -158.8681182861328, "loss": 0.5652, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8535667657852173, "rewards/margins": 2.6485307216644287, "rewards/rejected": -3.5020976066589355, "step": 71 }, { "epoch": 0.10572687224669604, "epsilon_dpo/beta": 0.06397364288568497, "epsilon_dpo/beta_margin_grad_mean": -0.15650521218776703, "epsilon_dpo/beta_margin_grad_std": 0.17222045361995697, "epsilon_dpo/beta_margin_mean": 2.5931060314178467, "epsilon_dpo/beta_margin_std": 2.0323598384857178, "epsilon_dpo/loss_margin_mean": 40.60845184326172, "grad_norm": 71.43647766113281, "kl/avg_steps": 0.8125, "kl/beta": 0.06449124217033386, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.999868246217933e-07, "logits/chosen": -3.2843093872070312, "logits/rejected": -3.316100835800171, "logps/chosen": -54.79957580566406, "logps/ref_chosen": -42.15618896484375, "logps/ref_rejected": -102.02656555175781, "logps/rejected": -155.27841186523438, "loss": 0.4074, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8116366863250732, "rewards/margins": 2.593106269836426, "rewards/rejected": -3.40474271774292, "step": 72 }, { "epoch": 0.10719530102790015, "epsilon_dpo/beta": 0.06355801969766617, "epsilon_dpo/beta_margin_grad_mean": -0.2032501995563507, "epsilon_dpo/beta_margin_grad_std": 0.21837185323238373, "epsilon_dpo/beta_margin_mean": 2.5123815536499023, "epsilon_dpo/beta_margin_std": 2.4150218963623047, "epsilon_dpo/loss_margin_mean": 39.681556701660156, "grad_norm": 75.35091400146484, "kl/avg_steps": 0.65625, "kl/beta": 0.06397147476673126, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.999703557245192e-07, "logits/chosen": -3.2197837829589844, "logits/rejected": -3.2296009063720703, "logps/chosen": -55.1505126953125, "logps/ref_chosen": -43.86912155151367, "logps/ref_rejected": -96.146728515625, "logps/rejected": -147.10968017578125, "loss": 0.5577, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7197288870811462, "rewards/margins": 2.5123815536499023, "rewards/rejected": -3.2321105003356934, "step": 73 }, { "epoch": 0.10866372980910426, "epsilon_dpo/beta": 0.06304432451725006, "epsilon_dpo/beta_margin_grad_mean": -0.15927036106586456, "epsilon_dpo/beta_margin_grad_std": 0.18017472326755524, "epsilon_dpo/beta_margin_mean": 2.8419299125671387, "epsilon_dpo/beta_margin_std": 2.680656909942627, "epsilon_dpo/loss_margin_mean": 45.170841217041016, "grad_norm": 65.50677490234375, "kl/avg_steps": 0.8125, "kl/beta": 0.06355439871549606, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.999472998758977e-07, "logits/chosen": -3.147824764251709, "logits/rejected": -3.291006088256836, "logps/chosen": -38.77723693847656, "logps/ref_chosen": -29.008399963378906, "logps/ref_rejected": -102.72833251953125, "logps/rejected": -157.6680145263672, "loss": 0.4061, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6169643998146057, "rewards/margins": 2.8419299125671387, "rewards/rejected": -3.4588942527770996, "step": 74 }, { "epoch": 0.11013215859030837, "epsilon_dpo/beta": 0.062496818602085114, "epsilon_dpo/beta_margin_grad_mean": -0.15049783885478973, "epsilon_dpo/beta_margin_grad_std": 0.1585647463798523, "epsilon_dpo/beta_margin_mean": 2.8132150173187256, "epsilon_dpo/beta_margin_std": 2.3551828861236572, "epsilon_dpo/loss_margin_mean": 45.07414245605469, "grad_norm": 62.55946731567383, "kl/avg_steps": 0.875, "kl/beta": 0.06304218620061874, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.999176576834721e-07, "logits/chosen": -3.151859760284424, "logits/rejected": -3.281860828399658, "logps/chosen": -45.443458557128906, "logps/ref_chosen": -30.710708618164062, "logps/ref_rejected": -117.44107818603516, "logps/rejected": -177.2479705810547, "loss": 0.3702, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9213298559188843, "rewards/margins": 2.8132152557373047, "rewards/rejected": -3.7345452308654785, "step": 75 }, { "epoch": 0.11160058737151249, "epsilon_dpo/beta": 0.062013305723667145, "epsilon_dpo/beta_margin_grad_mean": -0.19430118799209595, "epsilon_dpo/beta_margin_grad_std": 0.2106785923242569, "epsilon_dpo/beta_margin_mean": 2.20247745513916, "epsilon_dpo/beta_margin_std": 1.9251240491867065, "epsilon_dpo/loss_margin_mean": 35.627044677734375, "grad_norm": 96.22077941894531, "kl/avg_steps": 0.78125, "kl/beta": 0.06249535083770752, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.998814299283415e-07, "logits/chosen": -3.207365036010742, "logits/rejected": -3.260375499725342, "logps/chosen": -48.6707763671875, "logps/ref_chosen": -35.03684997558594, "logps/ref_rejected": -84.84458923339844, "logps/rejected": -134.10556030273438, "loss": 0.5828, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8483954668045044, "rewards/margins": 2.20247745513916, "rewards/rejected": -3.050872802734375, "step": 76 }, { "epoch": 0.1130690161527166, "epsilon_dpo/beta": 0.06151320040225983, "epsilon_dpo/beta_margin_grad_mean": -0.16589321196079254, "epsilon_dpo/beta_margin_grad_std": 0.19776158034801483, "epsilon_dpo/beta_margin_mean": 2.548051595687866, "epsilon_dpo/beta_margin_std": 2.222637414932251, "epsilon_dpo/loss_margin_mean": 41.525611877441406, "grad_norm": 109.90064239501953, "kl/avg_steps": 0.8125, "kl/beta": 0.06201088801026344, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.998386175651409e-07, "logits/chosen": -3.159731864929199, "logits/rejected": -3.2670416831970215, "logps/chosen": -57.969879150390625, "logps/ref_chosen": -44.09752655029297, "logps/ref_rejected": -98.75190734863281, "logps/rejected": -154.14987182617188, "loss": 0.5577, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8574961423873901, "rewards/margins": 2.548051357269287, "rewards/rejected": -3.405547618865967, "step": 77 }, { "epoch": 0.1145374449339207, "epsilon_dpo/beta": 0.06109432876110077, "epsilon_dpo/beta_margin_grad_mean": -0.19816353917121887, "epsilon_dpo/beta_margin_grad_std": 0.19675467908382416, "epsilon_dpo/beta_margin_mean": 2.1238291263580322, "epsilon_dpo/beta_margin_std": 1.9207454919815063, "epsilon_dpo/loss_margin_mean": 34.886810302734375, "grad_norm": 88.5641860961914, "kl/avg_steps": 0.6875, "kl/beta": 0.06151111051440239, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.997892217220159e-07, "logits/chosen": -3.2637405395507812, "logits/rejected": -3.250288963317871, "logps/chosen": -52.061004638671875, "logps/ref_chosen": -38.710906982421875, "logps/ref_rejected": -91.00759887695312, "logps/rejected": -139.2445068359375, "loss": 0.5417, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8202856779098511, "rewards/margins": 2.123828887939453, "rewards/rejected": -2.9441146850585938, "step": 78 }, { "epoch": 0.11600587371512482, "epsilon_dpo/beta": 0.060581713914871216, "epsilon_dpo/beta_margin_grad_mean": -0.1641714721918106, "epsilon_dpo/beta_margin_grad_std": 0.17812295258045197, "epsilon_dpo/beta_margin_mean": 2.5997278690338135, "epsilon_dpo/beta_margin_std": 2.096586227416992, "epsilon_dpo/loss_margin_mean": 42.99335479736328, "grad_norm": 86.2493667602539, "kl/avg_steps": 0.84375, "kl/beta": 0.0610911101102829, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.997332437005931e-07, "logits/chosen": -3.1998233795166016, "logits/rejected": -3.2696633338928223, "logps/chosen": -42.79743957519531, "logps/ref_chosen": -32.905845642089844, "logps/ref_rejected": -95.70394897460938, "logps/rejected": -148.58889770507812, "loss": 0.4274, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6022886633872986, "rewards/margins": 2.5997278690338135, "rewards/rejected": -3.202016592025757, "step": 79 }, { "epoch": 0.11747430249632893, "epsilon_dpo/beta": 0.06015056371688843, "epsilon_dpo/beta_margin_grad_mean": -0.20884603261947632, "epsilon_dpo/beta_margin_grad_std": 0.22885563969612122, "epsilon_dpo/beta_margin_mean": 2.3807008266448975, "epsilon_dpo/beta_margin_std": 2.437171220779419, "epsilon_dpo/loss_margin_mean": 39.722938537597656, "grad_norm": 94.2277603149414, "kl/avg_steps": 0.71875, "kl/beta": 0.06057996675372124, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.996706849759452e-07, "logits/chosen": -3.2733001708984375, "logits/rejected": -3.3127360343933105, "logps/chosen": -55.867835998535156, "logps/ref_chosen": -42.08654022216797, "logps/ref_rejected": -93.93815612792969, "logps/rejected": -147.4423828125, "loss": 0.6202, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8336386680603027, "rewards/margins": 2.3807008266448975, "rewards/rejected": -3.2143394947052, "step": 80 }, { "epoch": 0.11894273127753303, "epsilon_dpo/beta": 0.05966491997241974, "epsilon_dpo/beta_margin_grad_mean": -0.17548146843910217, "epsilon_dpo/beta_margin_grad_std": 0.19263611733913422, "epsilon_dpo/beta_margin_mean": 2.601060628890991, "epsilon_dpo/beta_margin_std": 2.4161243438720703, "epsilon_dpo/loss_margin_mean": 43.69050598144531, "grad_norm": 70.84126281738281, "kl/avg_steps": 0.8125, "kl/beta": 0.06014765426516533, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.996015471965529e-07, "logits/chosen": -3.1918153762817383, "logits/rejected": -3.2612764835357666, "logps/chosen": -52.02825927734375, "logps/ref_chosen": -39.808433532714844, "logps/ref_rejected": -132.9473876953125, "logps/rejected": -188.8577117919922, "loss": 0.4684, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7309558391571045, "rewards/margins": 2.601060628890991, "rewards/rejected": -3.3320164680480957, "step": 81 }, { "epoch": 0.12041116005873716, "epsilon_dpo/beta": 0.05922134220600128, "epsilon_dpo/beta_margin_grad_mean": -0.18756245076656342, "epsilon_dpo/beta_margin_grad_std": 0.23345990478992462, "epsilon_dpo/beta_margin_mean": 2.376243829727173, "epsilon_dpo/beta_margin_std": 2.1569864749908447, "epsilon_dpo/loss_margin_mean": 40.271209716796875, "grad_norm": 98.73262023925781, "kl/avg_steps": 0.75, "kl/beta": 0.059662893414497375, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.995258321842611e-07, "logits/chosen": -3.1882076263427734, "logits/rejected": -3.236912488937378, "logps/chosen": -46.98686218261719, "logps/ref_chosen": -33.495845794677734, "logps/ref_rejected": -96.71635437011719, "logps/rejected": -150.47857666015625, "loss": 0.6066, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8024958372116089, "rewards/margins": 2.376243829727173, "rewards/rejected": -3.178739547729492, "step": 82 }, { "epoch": 0.12187958883994127, "epsilon_dpo/beta": 0.05872496962547302, "epsilon_dpo/beta_margin_grad_mean": -0.16398201882839203, "epsilon_dpo/beta_margin_grad_std": 0.18266217410564423, "epsilon_dpo/beta_margin_mean": 2.464262008666992, "epsilon_dpo/beta_margin_std": 1.7969151735305786, "epsilon_dpo/loss_margin_mean": 42.03822326660156, "grad_norm": 67.05672454833984, "kl/avg_steps": 0.84375, "kl/beta": 0.05921875312924385, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.994435419342304e-07, "logits/chosen": -3.163329839706421, "logits/rejected": -3.297056198120117, "logps/chosen": -36.729583740234375, "logps/ref_chosen": -25.916236877441406, "logps/ref_rejected": -108.29981994628906, "logps/rejected": -161.15139770507812, "loss": 0.424, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6363058090209961, "rewards/margins": 2.464262008666992, "rewards/rejected": -3.1005678176879883, "step": 83 }, { "epoch": 0.12334801762114538, "epsilon_dpo/beta": 0.058215267956256866, "epsilon_dpo/beta_margin_grad_mean": -0.1662423461675644, "epsilon_dpo/beta_margin_grad_std": 0.16180217266082764, "epsilon_dpo/beta_margin_mean": 2.257798910140991, "epsilon_dpo/beta_margin_std": 1.5832382440567017, "epsilon_dpo/loss_margin_mean": 38.83984375, "grad_norm": 68.96977996826172, "kl/avg_steps": 0.875, "kl/beta": 0.05872327461838722, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.993546786148857e-07, "logits/chosen": -3.2171850204467773, "logits/rejected": -3.2014272212982178, "logps/chosen": -51.26725769042969, "logps/ref_chosen": -36.62953567504883, "logps/ref_rejected": -95.27814483642578, "logps/rejected": -148.75570678710938, "loss": 0.4147, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8531267642974854, "rewards/margins": 2.2577991485595703, "rewards/rejected": -3.1109256744384766, "step": 84 }, { "epoch": 0.12481644640234948, "epsilon_dpo/beta": 0.05781027674674988, "epsilon_dpo/beta_margin_grad_mean": -0.20659653842449188, "epsilon_dpo/beta_margin_grad_std": 0.2083936482667923, "epsilon_dpo/beta_margin_mean": 2.0086328983306885, "epsilon_dpo/beta_margin_std": 1.7025460004806519, "epsilon_dpo/loss_margin_mean": 34.859657287597656, "grad_norm": 81.10045623779297, "kl/avg_steps": 0.703125, "kl/beta": 0.05821390450000763, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.992592445678582e-07, "logits/chosen": -3.2968966960906982, "logits/rejected": -3.2394418716430664, "logps/chosen": -59.8691291809082, "logps/ref_chosen": -43.555397033691406, "logps/ref_rejected": -83.9044418334961, "logps/rejected": -135.07781982421875, "loss": 0.564, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9453153610229492, "rewards/margins": 2.0086328983306885, "rewards/rejected": -2.953948497772217, "step": 85 }, { "epoch": 0.1262848751835536, "epsilon_dpo/beta": 0.057433828711509705, "epsilon_dpo/beta_margin_grad_mean": -0.2296195775270462, "epsilon_dpo/beta_margin_grad_std": 0.22347640991210938, "epsilon_dpo/beta_margin_mean": 2.128232479095459, "epsilon_dpo/beta_margin_std": 2.417414903640747, "epsilon_dpo/loss_margin_mean": 37.22154998779297, "grad_norm": 107.6495132446289, "kl/avg_steps": 0.65625, "kl/beta": 0.05780744552612305, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.991572423079235e-07, "logits/chosen": -3.272704601287842, "logits/rejected": -3.2372689247131348, "logps/chosen": -54.787696838378906, "logps/ref_chosen": -38.846839904785156, "logps/ref_rejected": -92.59671020507812, "logps/rejected": -145.7591094970703, "loss": 0.6967, "rewards/accuracies": 0.875, "rewards/chosen": -0.9202961921691895, "rewards/margins": 2.128232479095459, "rewards/rejected": -3.0485289096832275, "step": 86 }, { "epoch": 0.1277533039647577, "epsilon_dpo/beta": 0.05698757991194725, "epsilon_dpo/beta_margin_grad_mean": -0.18504740297794342, "epsilon_dpo/beta_margin_grad_std": 0.18343256413936615, "epsilon_dpo/beta_margin_mean": 2.3569390773773193, "epsilon_dpo/beta_margin_std": 1.9608224630355835, "epsilon_dpo/loss_margin_mean": 41.45563507080078, "grad_norm": 59.761165618896484, "kl/avg_steps": 0.78125, "kl/beta": 0.057430557906627655, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.990486745229364e-07, "logits/chosen": -3.230192184448242, "logits/rejected": -3.3376736640930176, "logps/chosen": -52.74066162109375, "logps/ref_chosen": -38.653236389160156, "logps/ref_rejected": -102.44976043701172, "logps/rejected": -157.99281311035156, "loss": 0.4717, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8040882349014282, "rewards/margins": 2.3569388389587402, "rewards/rejected": -3.161027193069458, "step": 87 }, { "epoch": 0.12922173274596183, "epsilon_dpo/beta": 0.05656362697482109, "epsilon_dpo/beta_margin_grad_mean": -0.1606673151254654, "epsilon_dpo/beta_margin_grad_std": 0.18233220279216766, "epsilon_dpo/beta_margin_mean": 2.5598440170288086, "epsilon_dpo/beta_margin_std": 1.910704493522644, "epsilon_dpo/loss_margin_mean": 45.38097381591797, "grad_norm": 58.7109375, "kl/avg_steps": 0.75, "kl/beta": 0.05698535963892937, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.989335440737586e-07, "logits/chosen": -3.27421236038208, "logits/rejected": -3.3343310356140137, "logps/chosen": -51.133506774902344, "logps/ref_chosen": -37.23695373535156, "logps/ref_rejected": -116.12947082519531, "logps/rejected": -175.40699768066406, "loss": 0.4141, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7879468202590942, "rewards/margins": 2.5598437786102295, "rewards/rejected": -3.3477907180786133, "step": 88 }, { "epoch": 0.13069016152716592, "epsilon_dpo/beta": 0.05614255368709564, "epsilon_dpo/beta_margin_grad_mean": -0.18962043523788452, "epsilon_dpo/beta_margin_grad_std": 0.18511323630809784, "epsilon_dpo/beta_margin_mean": 2.2266876697540283, "epsilon_dpo/beta_margin_std": 1.9912763833999634, "epsilon_dpo/loss_margin_mean": 39.772979736328125, "grad_norm": 69.53501892089844, "kl/avg_steps": 0.75, "kl/beta": 0.056561149656772614, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.988118539941847e-07, "logits/chosen": -3.2087016105651855, "logits/rejected": -3.23443603515625, "logps/chosen": -52.141876220703125, "logps/ref_chosen": -39.35747146606445, "logps/ref_rejected": -88.01043701171875, "logps/rejected": -140.5678253173828, "loss": 0.4951, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7197608947753906, "rewards/margins": 2.2266876697540283, "rewards/rejected": -2.946448802947998, "step": 89 }, { "epoch": 0.13215859030837004, "epsilon_dpo/beta": 0.055777255445718765, "epsilon_dpo/beta_margin_grad_mean": -0.19303679466247559, "epsilon_dpo/beta_margin_grad_std": 0.21628044545650482, "epsilon_dpo/beta_margin_mean": 2.5097200870513916, "epsilon_dpo/beta_margin_std": 2.4309592247009277, "epsilon_dpo/loss_margin_mean": 45.174312591552734, "grad_norm": 91.28634643554688, "kl/avg_steps": 0.65625, "kl/beta": 0.056140098720788956, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.986836074908615e-07, "logits/chosen": -3.129134178161621, "logits/rejected": -3.3651089668273926, "logps/chosen": -46.801185607910156, "logps/ref_chosen": -30.30811882019043, "logps/ref_rejected": -119.35741424560547, "logps/rejected": -181.02479553222656, "loss": 0.5447, "rewards/accuracies": 0.875, "rewards/chosen": -0.9231457710266113, "rewards/margins": 2.5097203254699707, "rewards/rejected": -3.432866096496582, "step": 90 }, { "epoch": 0.13362701908957417, "epsilon_dpo/beta": 0.05532645061612129, "epsilon_dpo/beta_margin_grad_mean": -0.16726556420326233, "epsilon_dpo/beta_margin_grad_std": 0.17135490477085114, "epsilon_dpo/beta_margin_mean": 2.357635736465454, "epsilon_dpo/beta_margin_std": 1.7421871423721313, "epsilon_dpo/loss_margin_mean": 42.69865036010742, "grad_norm": 68.51940155029297, "kl/avg_steps": 0.8125, "kl/beta": 0.055774081498384476, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.985488079432037e-07, "logits/chosen": -3.2310478687286377, "logits/rejected": -3.2350800037384033, "logps/chosen": -50.61489486694336, "logps/ref_chosen": -35.484596252441406, "logps/ref_rejected": -94.16378784179688, "logps/rejected": -151.99273681640625, "loss": 0.4213, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8394383192062378, "rewards/margins": 2.357635736465454, "rewards/rejected": -3.1970739364624023, "step": 91 }, { "epoch": 0.13509544787077826, "epsilon_dpo/beta": 0.05489783734083176, "epsilon_dpo/beta_margin_grad_mean": -0.2024046629667282, "epsilon_dpo/beta_margin_grad_std": 0.220162495970726, "epsilon_dpo/beta_margin_mean": 2.2357397079467773, "epsilon_dpo/beta_margin_std": 2.0685596466064453, "epsilon_dpo/loss_margin_mean": 40.8467903137207, "grad_norm": 87.93189239501953, "kl/avg_steps": 0.78125, "kl/beta": 0.05532456934452057, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.984074589033043e-07, "logits/chosen": -3.121596574783325, "logits/rejected": -3.1575112342834473, "logps/chosen": -52.659706115722656, "logps/ref_chosen": -37.970062255859375, "logps/ref_rejected": -84.28839111328125, "logps/rejected": -139.8248291015625, "loss": 0.5756, "rewards/accuracies": 0.890625, "rewards/chosen": -0.808807373046875, "rewards/margins": 2.2357397079467773, "rewards/rejected": -3.0445470809936523, "step": 92 }, { "epoch": 0.13656387665198239, "epsilon_dpo/beta": 0.05447227507829666, "epsilon_dpo/beta_margin_grad_mean": -0.18667982518672943, "epsilon_dpo/beta_margin_grad_std": 0.18083734810352325, "epsilon_dpo/beta_margin_mean": 2.251617908477783, "epsilon_dpo/beta_margin_std": 1.9179073572158813, "epsilon_dpo/loss_margin_mean": 41.435726165771484, "grad_norm": 66.98503875732422, "kl/avg_steps": 0.78125, "kl/beta": 0.05489569902420044, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.982595640958425e-07, "logits/chosen": -3.2347559928894043, "logits/rejected": -3.2382359504699707, "logps/chosen": -51.651126861572266, "logps/ref_chosen": -35.3890266418457, "logps/ref_rejected": -81.84159851074219, "logps/rejected": -139.5394287109375, "loss": 0.4779, "rewards/accuracies": 0.921875, "rewards/chosen": -0.88683021068573, "rewards/margins": 2.251617908477783, "rewards/rejected": -3.1384482383728027, "step": 93 }, { "epoch": 0.13803230543318648, "epsilon_dpo/beta": 0.05399893596768379, "epsilon_dpo/beta_margin_grad_mean": -0.15303176641464233, "epsilon_dpo/beta_margin_grad_std": 0.14751987159252167, "epsilon_dpo/beta_margin_mean": 2.574958562850952, "epsilon_dpo/beta_margin_std": 1.9459197521209717, "epsilon_dpo/loss_margin_mean": 47.74397659301758, "grad_norm": 63.9381103515625, "kl/avg_steps": 0.875, "kl/beta": 0.05447014793753624, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.98105127417984e-07, "logits/chosen": -3.194207191467285, "logits/rejected": -3.2492258548736572, "logps/chosen": -53.30908203125, "logps/ref_chosen": -38.974853515625, "logps/ref_rejected": -106.16789245605469, "logps/rejected": -168.24609375, "loss": 0.3679, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7756279706954956, "rewards/margins": 2.574958324432373, "rewards/rejected": -3.350586414337158, "step": 94 }, { "epoch": 0.1395007342143906, "epsilon_dpo/beta": 0.05351366475224495, "epsilon_dpo/beta_margin_grad_mean": -0.176425963640213, "epsilon_dpo/beta_margin_grad_std": 0.15948951244354248, "epsilon_dpo/beta_margin_mean": 2.165757417678833, "epsilon_dpo/beta_margin_std": 1.6699368953704834, "epsilon_dpo/loss_margin_mean": 40.51607131958008, "grad_norm": 63.63764572143555, "kl/avg_steps": 0.90625, "kl/beta": 0.053997669368982315, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.979441529392784e-07, "logits/chosen": -3.180877208709717, "logits/rejected": -3.2767512798309326, "logps/chosen": -41.924564361572266, "logps/ref_chosen": -29.644317626953125, "logps/ref_rejected": -80.65695190429688, "logps/rejected": -133.45326232910156, "loss": 0.4364, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6584010124206543, "rewards/margins": 2.165757417678833, "rewards/rejected": -2.824158191680908, "step": 95 }, { "epoch": 0.14096916299559473, "epsilon_dpo/beta": 0.05311667546629906, "epsilon_dpo/beta_margin_grad_mean": -0.18641680479049683, "epsilon_dpo/beta_margin_grad_std": 0.20043620467185974, "epsilon_dpo/beta_margin_mean": 2.554248094558716, "epsilon_dpo/beta_margin_std": 2.2736546993255615, "epsilon_dpo/loss_margin_mean": 48.22751998901367, "grad_norm": 85.53929901123047, "kl/avg_steps": 0.75, "kl/beta": 0.05351271107792854, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.977766449015534e-07, "logits/chosen": -3.2188198566436768, "logits/rejected": -3.264829635620117, "logps/chosen": -65.12588500976562, "logps/ref_chosen": -48.42084884643555, "logps/ref_rejected": -102.56741333007812, "logps/rejected": -167.49996948242188, "loss": 0.4965, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8906632661819458, "rewards/margins": 2.554248094558716, "rewards/rejected": -3.444911479949951, "step": 96 }, { "epoch": 0.14243759177679882, "epsilon_dpo/beta": 0.05267146974802017, "epsilon_dpo/beta_margin_grad_mean": -0.15445828437805176, "epsilon_dpo/beta_margin_grad_std": 0.16520872712135315, "epsilon_dpo/beta_margin_mean": 2.608966827392578, "epsilon_dpo/beta_margin_std": 1.9469996690750122, "epsilon_dpo/loss_margin_mean": 49.61277770996094, "grad_norm": 52.84062194824219, "kl/avg_steps": 0.84375, "kl/beta": 0.053114354610443115, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.976026077188012e-07, "logits/chosen": -3.178095817565918, "logits/rejected": -3.169985294342041, "logps/chosen": -47.99535369873047, "logps/ref_chosen": -35.1164436340332, "logps/ref_rejected": -83.36341857910156, "logps/rejected": -145.8551025390625, "loss": 0.3851, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6797794103622437, "rewards/margins": 2.6089670658111572, "rewards/rejected": -3.2887463569641113, "step": 97 }, { "epoch": 0.14390602055800295, "epsilon_dpo/beta": 0.05228015407919884, "epsilon_dpo/beta_margin_grad_mean": -0.19064751267433167, "epsilon_dpo/beta_margin_grad_std": 0.22201648354530334, "epsilon_dpo/beta_margin_mean": 2.2540199756622314, "epsilon_dpo/beta_margin_std": 1.9207807779312134, "epsilon_dpo/loss_margin_mean": 43.26023864746094, "grad_norm": 101.56369018554688, "kl/avg_steps": 0.75, "kl/beta": 0.0526699498295784, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.974220459770639e-07, "logits/chosen": -3.108753204345703, "logits/rejected": -3.2454357147216797, "logps/chosen": -65.423828125, "logps/ref_chosen": -44.868499755859375, "logps/ref_rejected": -101.7425537109375, "logps/rejected": -165.55812072753906, "loss": 0.5783, "rewards/accuracies": 0.875, "rewards/chosen": -1.079300880432129, "rewards/margins": 2.2540199756622314, "rewards/rejected": -3.3333208560943604, "step": 98 }, { "epoch": 0.14537444933920704, "epsilon_dpo/beta": 0.051825616508722305, "epsilon_dpo/beta_margin_grad_mean": -0.1456242799758911, "epsilon_dpo/beta_margin_grad_std": 0.16527007520198822, "epsilon_dpo/beta_margin_mean": 2.7669012546539307, "epsilon_dpo/beta_margin_std": 2.086336135864258, "epsilon_dpo/loss_margin_mean": 53.46054458618164, "grad_norm": 56.95939254760742, "kl/avg_steps": 0.875, "kl/beta": 0.05227786675095558, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.972349644343108e-07, "logits/chosen": -3.0614562034606934, "logits/rejected": -3.1564629077911377, "logps/chosen": -40.343666076660156, "logps/ref_chosen": -25.89197540283203, "logps/ref_rejected": -91.06307983398438, "logps/rejected": -158.97531127929688, "loss": 0.3636, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7500526905059814, "rewards/margins": 2.7669014930725098, "rewards/rejected": -3.516953945159912, "step": 99 }, { "epoch": 0.14684287812041116, "epsilon_dpo/beta": 0.05150564759969711, "epsilon_dpo/beta_margin_grad_mean": -0.23118621110916138, "epsilon_dpo/beta_margin_grad_std": 0.23917478322982788, "epsilon_dpo/beta_margin_mean": 1.9697211980819702, "epsilon_dpo/beta_margin_std": 1.9218071699142456, "epsilon_dpo/loss_margin_mean": 38.432437896728516, "grad_norm": 109.34486389160156, "kl/avg_steps": 0.625, "kl/beta": 0.05182440206408501, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.970413680203148e-07, "logits/chosen": -3.0767757892608643, "logits/rejected": -3.0463757514953613, "logps/chosen": -60.350921630859375, "logps/ref_chosen": -41.60627746582031, "logps/ref_rejected": -79.52035522460938, "logps/rejected": -136.6974334716797, "loss": 0.6663, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9696159362792969, "rewards/margins": 1.9697211980819702, "rewards/rejected": -2.9393372535705566, "step": 100 }, { "epoch": 0.14684287812041116, "eval_epsilon_dpo/beta": 0.051277898252010345, "eval_epsilon_dpo/beta_margin_grad_mean": -0.30362221598625183, "eval_epsilon_dpo/beta_margin_grad_std": 0.2585260272026062, "eval_epsilon_dpo/beta_margin_mean": 1.5045814514160156, "eval_epsilon_dpo/beta_margin_std": 2.153695583343506, "eval_epsilon_dpo/loss_margin_mean": 29.590499877929688, "eval_kl/n_epsilon_steps": 0.2765410840511322, "eval_kl/p_epsilon_steps": 0.7226027250289917, "eval_logits/chosen": -3.276595115661621, "eval_logits/rejected": -3.2684147357940674, "eval_logps/chosen": -96.53782653808594, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -149.91758728027344, "eval_loss": 0.49363189935684204, "eval_rewards/accuracies": 0.7555650472640991, "eval_rewards/chosen": -1.4540938138961792, "eval_rewards/margins": 1.5045814514160156, "eval_rewards/rejected": -2.9586753845214844, "eval_runtime": 38.4605, "eval_samples_per_second": 60.816, "eval_steps_per_second": 1.924, "step": 100 }, { "epoch": 0.14831130690161526, "epsilon_dpo/beta": 0.051185738295316696, "epsilon_dpo/beta_margin_grad_mean": -0.23383933305740356, "epsilon_dpo/beta_margin_grad_std": 0.23400039970874786, "epsilon_dpo/beta_margin_mean": 1.9769922494888306, "epsilon_dpo/beta_margin_std": 2.0824813842773438, "epsilon_dpo/loss_margin_mean": 38.81412124633789, "grad_norm": 104.73916625976562, "kl/avg_steps": 0.625, "kl/beta": 0.05150251090526581, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.968412618365215e-07, "logits/chosen": -3.2080070972442627, "logits/rejected": -3.206873655319214, "logps/chosen": -67.021484375, "logps/ref_chosen": -46.34172439575195, "logps/ref_rejected": -89.14061737060547, "logps/rejected": -148.63449096679688, "loss": 0.6759, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0630580186843872, "rewards/margins": 1.976992130279541, "rewards/rejected": -3.0400502681732178, "step": 101 }, { "epoch": 0.14977973568281938, "epsilon_dpo/beta": 0.05080382898449898, "epsilon_dpo/beta_margin_grad_mean": -0.20627860724925995, "epsilon_dpo/beta_margin_grad_std": 0.19808727502822876, "epsilon_dpo/beta_margin_mean": 2.0157113075256348, "epsilon_dpo/beta_margin_std": 1.7352498769760132, "epsilon_dpo/loss_margin_mean": 39.79585647583008, "grad_norm": 70.99539184570312, "kl/avg_steps": 0.75, "kl/beta": 0.051182620227336884, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.966346511559149e-07, "logits/chosen": -3.2234458923339844, "logits/rejected": -3.1278076171875, "logps/chosen": -59.05921936035156, "logps/ref_chosen": -40.793182373046875, "logps/ref_rejected": -74.26336669921875, "logps/rejected": -132.32525634765625, "loss": 0.5495, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9300694465637207, "rewards/margins": 2.0157113075256348, "rewards/rejected": -2.9457807540893555, "step": 102 }, { "epoch": 0.1512481644640235, "epsilon_dpo/beta": 0.05037800967693329, "epsilon_dpo/beta_margin_grad_mean": -0.14925150573253632, "epsilon_dpo/beta_margin_grad_std": 0.1886918544769287, "epsilon_dpo/beta_margin_mean": 2.8233063220977783, "epsilon_dpo/beta_margin_std": 2.1322638988494873, "epsilon_dpo/loss_margin_mean": 56.148807525634766, "grad_norm": 70.43810272216797, "kl/avg_steps": 0.84375, "kl/beta": 0.050801608711481094, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.964215414228785e-07, "logits/chosen": -3.152428388595581, "logits/rejected": -3.057560920715332, "logps/chosen": -53.713714599609375, "logps/ref_chosen": -37.85403823852539, "logps/ref_rejected": -101.43891906738281, "logps/rejected": -173.4473876953125, "loss": 0.3972, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8012863397598267, "rewards/margins": 2.823306083679199, "rewards/rejected": -3.6245927810668945, "step": 103 }, { "epoch": 0.1527165932452276, "epsilon_dpo/beta": 0.04997224360704422, "epsilon_dpo/beta_margin_grad_mean": -0.16119545698165894, "epsilon_dpo/beta_margin_grad_std": 0.1801510900259018, "epsilon_dpo/beta_margin_mean": 2.779792547225952, "epsilon_dpo/beta_margin_std": 2.224303960800171, "epsilon_dpo/loss_margin_mean": 55.73902893066406, "grad_norm": 61.21420669555664, "kl/avg_steps": 0.8125, "kl/beta": 0.05037655681371689, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.96201938253052e-07, "logits/chosen": -3.150313377380371, "logits/rejected": -3.136101245880127, "logps/chosen": -51.264808654785156, "logps/ref_chosen": -35.89470672607422, "logps/ref_rejected": -97.03848266601562, "logps/rejected": -168.14761352539062, "loss": 0.4096, "rewards/accuracies": 0.921875, "rewards/chosen": -0.769996166229248, "rewards/margins": 2.779792308807373, "rewards/rejected": -3.549788475036621, "step": 104 }, { "epoch": 0.15418502202643172, "epsilon_dpo/beta": 0.04952263832092285, "epsilon_dpo/beta_margin_grad_mean": -0.12934987246990204, "epsilon_dpo/beta_margin_grad_std": 0.15336252748966217, "epsilon_dpo/beta_margin_mean": 2.9811060428619385, "epsilon_dpo/beta_margin_std": 2.177426338195801, "epsilon_dpo/loss_margin_mean": 60.252281188964844, "grad_norm": 61.274295806884766, "kl/avg_steps": 0.90625, "kl/beta": 0.04997054487466812, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.959758474331832e-07, "logits/chosen": -3.1032373905181885, "logits/rejected": -3.1879420280456543, "logps/chosen": -56.80089569091797, "logps/ref_chosen": -39.414154052734375, "logps/ref_rejected": -105.14703369140625, "logps/rejected": -182.7860565185547, "loss": 0.3212, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8625220060348511, "rewards/margins": 2.9811058044433594, "rewards/rejected": -3.8436279296875, "step": 105 }, { "epoch": 0.15565345080763582, "epsilon_dpo/beta": 0.04914744198322296, "epsilon_dpo/beta_margin_grad_mean": -0.18032847344875336, "epsilon_dpo/beta_margin_grad_std": 0.19534119963645935, "epsilon_dpo/beta_margin_mean": 2.3009119033813477, "epsilon_dpo/beta_margin_std": 1.8214956521987915, "epsilon_dpo/loss_margin_mean": 46.947818756103516, "grad_norm": 60.775062561035156, "kl/avg_steps": 0.765625, "kl/beta": 0.049521755427122116, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.957432749209755e-07, "logits/chosen": -3.1399245262145996, "logits/rejected": -3.1790456771850586, "logps/chosen": -58.34198760986328, "logps/ref_chosen": -39.36569595336914, "logps/ref_rejected": -90.00059509277344, "logps/rejected": -155.92471313476562, "loss": 0.4771, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9345280528068542, "rewards/margins": 2.3009119033813477, "rewards/rejected": -3.2354397773742676, "step": 106 }, { "epoch": 0.15712187958883994, "epsilon_dpo/beta": 0.04868961498141289, "epsilon_dpo/beta_margin_grad_mean": -0.14175310730934143, "epsilon_dpo/beta_margin_grad_std": 0.17217276990413666, "epsilon_dpo/beta_margin_mean": 2.8992745876312256, "epsilon_dpo/beta_margin_std": 2.3069019317626953, "epsilon_dpo/loss_margin_mean": 59.605445861816406, "grad_norm": 64.9974365234375, "kl/avg_steps": 0.9375, "kl/beta": 0.04914548620581627, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.955042268449307e-07, "logits/chosen": -3.1247940063476562, "logits/rejected": -3.1252992153167725, "logps/chosen": -56.38127899169922, "logps/ref_chosen": -40.38301086425781, "logps/ref_rejected": -100.88917541503906, "logps/rejected": -176.49288940429688, "loss": 0.383, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7801457643508911, "rewards/margins": 2.8992745876312256, "rewards/rejected": -3.6794204711914062, "step": 107 }, { "epoch": 0.15859030837004406, "epsilon_dpo/beta": 0.048267822712659836, "epsilon_dpo/beta_margin_grad_mean": -0.13621723651885986, "epsilon_dpo/beta_margin_grad_std": 0.16647490859031677, "epsilon_dpo/beta_margin_mean": 3.0122811794281006, "epsilon_dpo/beta_margin_std": 2.167310953140259, "epsilon_dpo/loss_margin_mean": 62.49602508544922, "grad_norm": 53.83271026611328, "kl/avg_steps": 0.875, "kl/beta": 0.04868902638554573, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.952587095041881e-07, "logits/chosen": -3.1003808975219727, "logits/rejected": -3.143059492111206, "logps/chosen": -49.35261917114258, "logps/ref_chosen": -31.513410568237305, "logps/ref_rejected": -104.42971801757812, "logps/rejected": -184.76495361328125, "loss": 0.3473, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8619762063026428, "rewards/margins": 3.0122811794281006, "rewards/rejected": -3.8742575645446777, "step": 108 }, { "epoch": 0.16005873715124816, "epsilon_dpo/beta": 0.04787931218743324, "epsilon_dpo/beta_margin_grad_mean": -0.17085351049900055, "epsilon_dpo/beta_margin_grad_std": 0.21672260761260986, "epsilon_dpo/beta_margin_mean": 2.650977611541748, "epsilon_dpo/beta_margin_std": 2.2566730976104736, "epsilon_dpo/loss_margin_mean": 55.51750564575195, "grad_norm": 100.95081329345703, "kl/avg_steps": 0.8125, "kl/beta": 0.0482666902244091, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.95006729368358e-07, "logits/chosen": -3.113433361053467, "logits/rejected": -3.186359167098999, "logps/chosen": -64.14362335205078, "logps/ref_chosen": -45.64976501464844, "logps/ref_rejected": -104.12217712402344, "logps/rejected": -178.133544921875, "loss": 0.5342, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8883336782455444, "rewards/margins": 2.650977611541748, "rewards/rejected": -3.539311408996582, "step": 109 }, { "epoch": 0.16152716593245228, "epsilon_dpo/beta": 0.04758320748806, "epsilon_dpo/beta_margin_grad_mean": -0.20529796183109283, "epsilon_dpo/beta_margin_grad_std": 0.21916480362415314, "epsilon_dpo/beta_margin_mean": 2.1833033561706543, "epsilon_dpo/beta_margin_std": 2.0075676441192627, "epsilon_dpo/loss_margin_mean": 46.09016799926758, "grad_norm": 87.10065460205078, "kl/avg_steps": 0.625, "kl/beta": 0.047877684235572815, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.947482930773511e-07, "logits/chosen": -3.0851223468780518, "logits/rejected": -3.0515449047088623, "logps/chosen": -74.82107543945312, "logps/ref_chosen": -50.388492584228516, "logps/ref_rejected": -85.45807647705078, "logps/rejected": -155.9808349609375, "loss": 0.5765, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1659729480743408, "rewards/margins": 2.1833033561706543, "rewards/rejected": -3.349276065826416, "step": 110 }, { "epoch": 0.16299559471365638, "epsilon_dpo/beta": 0.04724304750561714, "epsilon_dpo/beta_margin_grad_mean": -0.17633913457393646, "epsilon_dpo/beta_margin_grad_std": 0.20233316719532013, "epsilon_dpo/beta_margin_mean": 2.6269123554229736, "epsilon_dpo/beta_margin_std": 2.2809414863586426, "epsilon_dpo/loss_margin_mean": 55.77882766723633, "grad_norm": 77.24600219726562, "kl/avg_steps": 0.71875, "kl/beta": 0.047580309212207794, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.944834074412042e-07, "logits/chosen": -3.054626941680908, "logits/rejected": -3.125295877456665, "logps/chosen": -75.80860900878906, "logps/ref_chosen": -50.71482849121094, "logps/ref_rejected": -103.22978210449219, "logps/rejected": -184.10238647460938, "loss": 0.4814, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1880950927734375, "rewards/margins": 2.6269125938415527, "rewards/rejected": -3.8150076866149902, "step": 111 }, { "epoch": 0.1644640234948605, "epsilon_dpo/beta": 0.04686162248253822, "epsilon_dpo/beta_margin_grad_mean": -0.19428886473178864, "epsilon_dpo/beta_margin_grad_std": 0.19283361732959747, "epsilon_dpo/beta_margin_mean": 2.084317207336426, "epsilon_dpo/beta_margin_std": 1.6938108205795288, "epsilon_dpo/loss_margin_mean": 44.58252716064453, "grad_norm": 59.31022262573242, "kl/avg_steps": 0.8125, "kl/beta": 0.04724076762795448, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.942120794399002e-07, "logits/chosen": -3.0048656463623047, "logits/rejected": -3.0176172256469727, "logps/chosen": -50.82137680053711, "logps/ref_chosen": -31.809959411621094, "logps/ref_rejected": -68.63668823242188, "logps/rejected": -132.23062133789062, "loss": 0.5174, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8925973176956177, "rewards/margins": 2.084317207336426, "rewards/rejected": -2.976914405822754, "step": 112 }, { "epoch": 0.16593245227606462, "epsilon_dpo/beta": 0.04648394137620926, "epsilon_dpo/beta_margin_grad_mean": -0.1680552214384079, "epsilon_dpo/beta_margin_grad_std": 0.16932794451713562, "epsilon_dpo/beta_margin_mean": 2.274296760559082, "epsilon_dpo/beta_margin_std": 1.6202584505081177, "epsilon_dpo/loss_margin_mean": 49.026641845703125, "grad_norm": 57.292938232421875, "kl/avg_steps": 0.8125, "kl/beta": 0.04686002805829048, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.939343162231841e-07, "logits/chosen": -3.054755687713623, "logits/rejected": -3.0067625045776367, "logps/chosen": -64.85562896728516, "logps/ref_chosen": -42.08271408081055, "logps/ref_rejected": -83.40155029296875, "logps/rejected": -155.20111083984375, "loss": 0.4213, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0600247383117676, "rewards/margins": 2.274296760559082, "rewards/rejected": -3.3343217372894287, "step": 113 }, { "epoch": 0.16740088105726872, "epsilon_dpo/beta": 0.04610930010676384, "epsilon_dpo/beta_margin_grad_mean": -0.1812613159418106, "epsilon_dpo/beta_margin_grad_std": 0.20109733939170837, "epsilon_dpo/beta_margin_mean": 2.5884387493133545, "epsilon_dpo/beta_margin_std": 2.3022119998931885, "epsilon_dpo/loss_margin_mean": 56.26390075683594, "grad_norm": 70.24754333496094, "kl/avg_steps": 0.8125, "kl/beta": 0.04648235812783241, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.936501251103751e-07, "logits/chosen": -2.998676300048828, "logits/rejected": -3.029796600341797, "logps/chosen": -63.09971237182617, "logps/ref_chosen": -42.380760192871094, "logps/ref_rejected": -91.27511596679688, "logps/rejected": -168.25796508789062, "loss": 0.4844, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9574916362762451, "rewards/margins": 2.5884387493133545, "rewards/rejected": -3.5459303855895996, "step": 114 }, { "epoch": 0.16886930983847284, "epsilon_dpo/beta": 0.04570886492729187, "epsilon_dpo/beta_margin_grad_mean": -0.16286557912826538, "epsilon_dpo/beta_margin_grad_std": 0.18301641941070557, "epsilon_dpo/beta_margin_mean": 2.5425362586975098, "epsilon_dpo/beta_margin_std": 1.9873340129852295, "epsilon_dpo/loss_margin_mean": 55.71139144897461, "grad_norm": 66.69357299804688, "kl/avg_steps": 0.875, "kl/beta": 0.046107735484838486, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.933595135901732e-07, "logits/chosen": -3.068523406982422, "logits/rejected": -3.093740463256836, "logps/chosen": -61.14916229248047, "logps/ref_chosen": -40.20535659790039, "logps/ref_rejected": -106.33647155761719, "logps/rejected": -182.99166870117188, "loss": 0.4575, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9589067697525024, "rewards/margins": 2.5425362586975098, "rewards/rejected": -3.5014431476593018, "step": 115 }, { "epoch": 0.17033773861967694, "epsilon_dpo/beta": 0.04532666504383087, "epsilon_dpo/beta_margin_grad_mean": -0.17079366743564606, "epsilon_dpo/beta_margin_grad_std": 0.1683167964220047, "epsilon_dpo/beta_margin_mean": 2.3608319759368896, "epsilon_dpo/beta_margin_std": 1.805658221244812, "epsilon_dpo/loss_margin_mean": 52.17644500732422, "grad_norm": 69.93063354492188, "kl/avg_steps": 0.84375, "kl/beta": 0.04570779204368591, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.930624893204624e-07, "logits/chosen": -2.8386454582214355, "logits/rejected": -2.940783977508545, "logps/chosen": -52.52592086791992, "logps/ref_chosen": -30.788612365722656, "logps/ref_rejected": -85.05826568603516, "logps/rejected": -158.97201538085938, "loss": 0.4288, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9880549907684326, "rewards/margins": 2.3608322143554688, "rewards/rejected": -3.3488869667053223, "step": 116 }, { "epoch": 0.17180616740088106, "epsilon_dpo/beta": 0.04501824453473091, "epsilon_dpo/beta_margin_grad_mean": -0.18530869483947754, "epsilon_dpo/beta_margin_grad_std": 0.18177969753742218, "epsilon_dpo/beta_margin_mean": 2.1722805500030518, "epsilon_dpo/beta_margin_std": 1.6980078220367432, "epsilon_dpo/loss_margin_mean": 48.40717697143555, "grad_norm": 82.57999420166016, "kl/avg_steps": 0.6875, "kl/beta": 0.04532535746693611, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.927590601281083e-07, "logits/chosen": -2.9002761840820312, "logits/rejected": -2.8406801223754883, "logps/chosen": -77.98645782470703, "logps/ref_chosen": -49.41318893432617, "logps/ref_rejected": -72.67740631103516, "logps/rejected": -149.6578369140625, "loss": 0.4764, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2893564701080322, "rewards/margins": 2.1722805500030518, "rewards/rejected": -3.461637020111084, "step": 117 }, { "epoch": 0.17327459618208516, "epsilon_dpo/beta": 0.044626448303461075, "epsilon_dpo/beta_margin_grad_mean": -0.15689337253570557, "epsilon_dpo/beta_margin_grad_std": 0.17096935212612152, "epsilon_dpo/beta_margin_mean": 2.458299398422241, "epsilon_dpo/beta_margin_std": 1.716274380683899, "epsilon_dpo/loss_margin_mean": 55.162723541259766, "grad_norm": 55.6863899230957, "kl/avg_steps": 0.875, "kl/beta": 0.04501587525010109, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.924492340087524e-07, "logits/chosen": -2.8982632160186768, "logits/rejected": -2.9912490844726562, "logps/chosen": -58.43132400512695, "logps/ref_chosen": -35.58433532714844, "logps/ref_rejected": -80.31232452392578, "logps/rejected": -158.32205200195312, "loss": 0.3965, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0198127031326294, "rewards/margins": 2.458299398422241, "rewards/rejected": -3.47811222076416, "step": 118 }, { "epoch": 0.17474302496328928, "epsilon_dpo/beta": 0.04427415132522583, "epsilon_dpo/beta_margin_grad_mean": -0.16944344341754913, "epsilon_dpo/beta_margin_grad_std": 0.20828606188297272, "epsilon_dpo/beta_margin_mean": 2.63626766204834, "epsilon_dpo/beta_margin_std": 2.4180660247802734, "epsilon_dpo/loss_margin_mean": 59.708961486816406, "grad_norm": 105.04483795166016, "kl/avg_steps": 0.796875, "kl/beta": 0.04462540149688721, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.92133019126601e-07, "logits/chosen": -2.9290332794189453, "logits/rejected": -2.974984645843506, "logps/chosen": -68.64191436767578, "logps/ref_chosen": -41.200706481933594, "logps/ref_rejected": -109.11971282958984, "logps/rejected": -196.2698974609375, "loss": 0.5632, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2193212509155273, "rewards/margins": 2.63626766204834, "rewards/rejected": -3.855588912963867, "step": 119 }, { "epoch": 0.1762114537444934, "epsilon_dpo/beta": 0.043917279690504074, "epsilon_dpo/beta_margin_grad_mean": -0.1446925848722458, "epsilon_dpo/beta_margin_grad_std": 0.17125862836837769, "epsilon_dpo/beta_margin_mean": 2.938814163208008, "epsilon_dpo/beta_margin_std": 2.2117486000061035, "epsilon_dpo/loss_margin_mean": 67.05432891845703, "grad_norm": 58.40270233154297, "kl/avg_steps": 0.8125, "kl/beta": 0.04427260532975197, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.918104238142103e-07, "logits/chosen": -2.980833053588867, "logits/rejected": -2.9574685096740723, "logps/chosen": -79.7890625, "logps/ref_chosen": -53.9609489440918, "logps/ref_rejected": -114.34416961669922, "logps/rejected": -207.2266082763672, "loss": 0.3704, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1369682550430298, "rewards/margins": 2.938814163208008, "rewards/rejected": -4.075782775878906, "step": 120 }, { "epoch": 0.1776798825256975, "epsilon_dpo/beta": 0.04350843280553818, "epsilon_dpo/beta_margin_grad_mean": -0.12654456496238708, "epsilon_dpo/beta_margin_grad_std": 0.1407029926776886, "epsilon_dpo/beta_margin_mean": 3.131282091140747, "epsilon_dpo/beta_margin_std": 2.3148951530456543, "epsilon_dpo/loss_margin_mean": 72.01702117919922, "grad_norm": 43.766937255859375, "kl/avg_steps": 0.9375, "kl/beta": 0.04391578957438469, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.91481456572267e-07, "logits/chosen": -2.7587661743164062, "logits/rejected": -2.9042224884033203, "logps/chosen": -57.087093353271484, "logps/ref_chosen": -32.518829345703125, "logps/ref_rejected": -119.22189331054688, "logps/rejected": -215.80718994140625, "loss": 0.3024, "rewards/accuracies": 0.984375, "rewards/chosen": -1.0692138671875, "rewards/margins": 3.131281852722168, "rewards/rejected": -4.200495719909668, "step": 121 }, { "epoch": 0.17914831130690162, "epsilon_dpo/beta": 0.04314511641860008, "epsilon_dpo/beta_margin_grad_mean": -0.13571257889270782, "epsilon_dpo/beta_margin_grad_std": 0.1690312772989273, "epsilon_dpo/beta_margin_mean": 3.182982921600342, "epsilon_dpo/beta_margin_std": 2.4074745178222656, "epsilon_dpo/loss_margin_mean": 73.9016342163086, "grad_norm": 72.01390075683594, "kl/avg_steps": 0.84375, "kl/beta": 0.0435079000890255, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.911461260693638e-07, "logits/chosen": -2.679837942123413, "logits/rejected": -2.8655993938446045, "logps/chosen": -59.01881790161133, "logps/ref_chosen": -31.410724639892578, "logps/ref_rejected": -114.03694152832031, "logps/rejected": -215.54666137695312, "loss": 0.3434, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1941652297973633, "rewards/margins": 3.182982921600342, "rewards/rejected": -4.377148151397705, "step": 122 }, { "epoch": 0.18061674008810572, "epsilon_dpo/beta": 0.04283806309103966, "epsilon_dpo/beta_margin_grad_mean": -0.16673751175403595, "epsilon_dpo/beta_margin_grad_std": 0.19917066395282745, "epsilon_dpo/beta_margin_mean": 2.654533624649048, "epsilon_dpo/beta_margin_std": 2.180321216583252, "epsilon_dpo/loss_margin_mean": 62.16706085205078, "grad_norm": 75.45635223388672, "kl/avg_steps": 0.71875, "kl/beta": 0.043143875896930695, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.908044411417711e-07, "logits/chosen": -2.7718982696533203, "logits/rejected": -2.8315348625183105, "logps/chosen": -72.34667205810547, "logps/ref_chosen": -45.39820098876953, "logps/ref_rejected": -95.36239624023438, "logps/rejected": -184.47792053222656, "loss": 0.4491, "rewards/accuracies": 0.875, "rewards/chosen": -1.1574070453643799, "rewards/margins": 2.654533624649048, "rewards/rejected": -3.8119406700134277, "step": 123 }, { "epoch": 0.18208516886930984, "epsilon_dpo/beta": 0.04243864864110947, "epsilon_dpo/beta_margin_grad_mean": -0.12405575811862946, "epsilon_dpo/beta_margin_grad_std": 0.14530256390571594, "epsilon_dpo/beta_margin_mean": 3.2046613693237305, "epsilon_dpo/beta_margin_std": 2.2337818145751953, "epsilon_dpo/loss_margin_mean": 75.55775451660156, "grad_norm": 42.3536376953125, "kl/avg_steps": 0.9375, "kl/beta": 0.04283599182963371, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.904564107932048e-07, "logits/chosen": -2.7853384017944336, "logits/rejected": -2.904179096221924, "logps/chosen": -68.72981262207031, "logps/ref_chosen": -42.682899475097656, "logps/ref_rejected": -124.48335266113281, "logps/rejected": -226.0880126953125, "loss": 0.2979, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1055363416671753, "rewards/margins": 3.2046613693237305, "rewards/rejected": -4.310197830200195, "step": 124 }, { "epoch": 0.18355359765051396, "epsilon_dpo/beta": 0.04211079701781273, "epsilon_dpo/beta_margin_grad_mean": -0.16087745130062103, "epsilon_dpo/beta_margin_grad_std": 0.1763782650232315, "epsilon_dpo/beta_margin_mean": 2.658567428588867, "epsilon_dpo/beta_margin_std": 2.0561933517456055, "epsilon_dpo/loss_margin_mean": 63.280696868896484, "grad_norm": 81.41658020019531, "kl/avg_steps": 0.78125, "kl/beta": 0.04243813455104828, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.90102044194588e-07, "logits/chosen": -2.5554890632629395, "logits/rejected": -2.767099618911743, "logps/chosen": -63.22058868408203, "logps/ref_chosen": -30.618457794189453, "logps/ref_rejected": -88.96886444091797, "logps/rejected": -184.8516845703125, "loss": 0.4111, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3759876489639282, "rewards/margins": 2.658567428588867, "rewards/rejected": -4.034554958343506, "step": 125 }, { "epoch": 0.18502202643171806, "epsilon_dpo/beta": 0.041758038103580475, "epsilon_dpo/beta_margin_grad_mean": -0.1536438912153244, "epsilon_dpo/beta_margin_grad_std": 0.21458838880062103, "epsilon_dpo/beta_margin_mean": 2.926898956298828, "epsilon_dpo/beta_margin_std": 2.228339433670044, "epsilon_dpo/loss_margin_mean": 70.24527740478516, "grad_norm": 81.50450897216797, "kl/avg_steps": 0.84375, "kl/beta": 0.042109157890081406, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.897413506838102e-07, "logits/chosen": -2.6770665645599365, "logits/rejected": -2.785977840423584, "logps/chosen": -72.39849090576172, "logps/ref_chosen": -38.92976379394531, "logps/ref_rejected": -104.5777587890625, "logps/rejected": -208.291748046875, "loss": 0.4561, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4002439975738525, "rewards/margins": 2.926898956298828, "rewards/rejected": -4.327142715454102, "step": 126 }, { "epoch": 0.18649045521292218, "epsilon_dpo/beta": 0.04140865430235863, "epsilon_dpo/beta_margin_grad_mean": -0.1614929735660553, "epsilon_dpo/beta_margin_grad_std": 0.1582755744457245, "epsilon_dpo/beta_margin_mean": 2.3239240646362305, "epsilon_dpo/beta_margin_std": 1.5966986417770386, "epsilon_dpo/loss_margin_mean": 56.2106819152832, "grad_norm": 61.22636413574219, "kl/avg_steps": 0.84375, "kl/beta": 0.04175683483481407, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.89374339765481e-07, "logits/chosen": -2.705301523208618, "logits/rejected": -2.7008934020996094, "logps/chosen": -69.19281005859375, "logps/ref_chosen": -39.06876754760742, "logps/ref_rejected": -81.60107421875, "logps/rejected": -167.935791015625, "loss": 0.395, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2484067678451538, "rewards/margins": 2.3239240646362305, "rewards/rejected": -3.5723307132720947, "step": 127 }, { "epoch": 0.18795888399412627, "epsilon_dpo/beta": 0.04112689569592476, "epsilon_dpo/beta_margin_grad_mean": -0.18886899948120117, "epsilon_dpo/beta_margin_grad_std": 0.20472285151481628, "epsilon_dpo/beta_margin_mean": 2.6414670944213867, "epsilon_dpo/beta_margin_std": 2.408226251602173, "epsilon_dpo/loss_margin_mean": 64.43711853027344, "grad_norm": 69.2333984375, "kl/avg_steps": 0.6875, "kl/beta": 0.04140745848417282, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.890010211106795e-07, "logits/chosen": -2.7107484340667725, "logits/rejected": -2.624185085296631, "logps/chosen": -70.23330688476562, "logps/ref_chosen": -37.46366882324219, "logps/ref_rejected": -81.10179138183594, "logps/rejected": -178.3085479736328, "loss": 0.5065, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3495298624038696, "rewards/margins": 2.6414670944213867, "rewards/rejected": -3.990996837615967, "step": 128 }, { "epoch": 0.1894273127753304, "epsilon_dpo/beta": 0.040820371359586716, "epsilon_dpo/beta_margin_grad_mean": -0.16870415210723877, "epsilon_dpo/beta_margin_grad_std": 0.19453568756580353, "epsilon_dpo/beta_margin_mean": 2.5504720211029053, "epsilon_dpo/beta_margin_std": 2.0157787799835205, "epsilon_dpo/loss_margin_mean": 62.65648651123047, "grad_norm": 62.1826057434082, "kl/avg_steps": 0.75, "kl/beta": 0.041124723851680756, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.88621404556699e-07, "logits/chosen": -2.753101348876953, "logits/rejected": -2.694021701812744, "logps/chosen": -82.27946472167969, "logps/ref_chosen": -48.18524932861328, "logps/ref_rejected": -98.1295166015625, "logps/rejected": -194.88021850585938, "loss": 0.4566, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3921856880187988, "rewards/margins": 2.5504720211029053, "rewards/rejected": -3.942657709121704, "step": 129 }, { "epoch": 0.19089574155653452, "epsilon_dpo/beta": 0.04049098491668701, "epsilon_dpo/beta_margin_grad_mean": -0.14500190317630768, "epsilon_dpo/beta_margin_grad_std": 0.1896047294139862, "epsilon_dpo/beta_margin_mean": 2.9150590896606445, "epsilon_dpo/beta_margin_std": 2.0989155769348145, "epsilon_dpo/loss_margin_mean": 72.14749145507812, "grad_norm": 67.861572265625, "kl/avg_steps": 0.8125, "kl/beta": 0.04081858694553375, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.882355001067891e-07, "logits/chosen": -2.629741668701172, "logits/rejected": -2.686960220336914, "logps/chosen": -60.728355407714844, "logps/ref_chosen": -28.03460121154785, "logps/ref_rejected": -87.78228759765625, "logps/rejected": -192.62353515625, "loss": 0.3878, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3256336450576782, "rewards/margins": 2.9150590896606445, "rewards/rejected": -4.240693092346191, "step": 130 }, { "epoch": 0.19236417033773862, "epsilon_dpo/beta": 0.040177300572395325, "epsilon_dpo/beta_margin_grad_mean": -0.18838103115558624, "epsilon_dpo/beta_margin_grad_std": 0.19518207013607025, "epsilon_dpo/beta_margin_mean": 2.35014009475708, "epsilon_dpo/beta_margin_std": 1.9053841829299927, "epsilon_dpo/loss_margin_mean": 58.63149642944336, "grad_norm": 55.52901077270508, "kl/avg_steps": 0.78125, "kl/beta": 0.04048960655927658, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.878433179298909e-07, "logits/chosen": -2.6353015899658203, "logits/rejected": -2.799935817718506, "logps/chosen": -64.12167358398438, "logps/ref_chosen": -34.23711013793945, "logps/ref_rejected": -94.21966552734375, "logps/rejected": -182.7357177734375, "loss": 0.4887, "rewards/accuracies": 0.890625, "rewards/chosen": -1.201073169708252, "rewards/margins": 2.35014009475708, "rewards/rejected": -3.551213264465332, "step": 131 }, { "epoch": 0.19383259911894274, "epsilon_dpo/beta": 0.0398343950510025, "epsilon_dpo/beta_margin_grad_mean": -0.15264593064785004, "epsilon_dpo/beta_margin_grad_std": 0.15929141640663147, "epsilon_dpo/beta_margin_mean": 2.4843099117279053, "epsilon_dpo/beta_margin_std": 1.845669150352478, "epsilon_dpo/loss_margin_mean": 62.46304702758789, "grad_norm": 69.0653305053711, "kl/avg_steps": 0.859375, "kl/beta": 0.04017573595046997, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.874448683603694e-07, "logits/chosen": -2.7254421710968018, "logits/rejected": -2.769134044647217, "logps/chosen": -86.7735824584961, "logps/ref_chosen": -48.03201675415039, "logps/ref_rejected": -93.43745422363281, "logps/rejected": -194.64205932617188, "loss": 0.3811, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5439248085021973, "rewards/margins": 2.4843099117279053, "rewards/rejected": -4.028234958648682, "step": 132 }, { "epoch": 0.19530102790014683, "epsilon_dpo/beta": 0.03946392610669136, "epsilon_dpo/beta_margin_grad_mean": -0.14380493760108948, "epsilon_dpo/beta_margin_grad_std": 0.15240231156349182, "epsilon_dpo/beta_margin_mean": 2.68829607963562, "epsilon_dpo/beta_margin_std": 1.8531904220581055, "epsilon_dpo/loss_margin_mean": 68.16297149658203, "grad_norm": 58.320655822753906, "kl/avg_steps": 0.9375, "kl/beta": 0.03983341529965401, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.870401618977415e-07, "logits/chosen": -2.6911277770996094, "logits/rejected": -2.8012609481811523, "logps/chosen": -73.2670669555664, "logps/ref_chosen": -38.89875793457031, "logps/ref_rejected": -103.64884948730469, "logps/rejected": -206.1801300048828, "loss": 0.3486, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3570879697799683, "rewards/margins": 2.688295841217041, "rewards/rejected": -4.045383930206299, "step": 133 }, { "epoch": 0.19676945668135096, "epsilon_dpo/beta": 0.03912205249071121, "epsilon_dpo/beta_margin_grad_mean": -0.15723571181297302, "epsilon_dpo/beta_margin_grad_std": 0.16265857219696045, "epsilon_dpo/beta_margin_mean": 2.531024217605591, "epsilon_dpo/beta_margin_std": 1.835780143737793, "epsilon_dpo/loss_margin_mean": 64.7786865234375, "grad_norm": 53.25428009033203, "kl/avg_steps": 0.875, "kl/beta": 0.03946344554424286, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.866292092063986e-07, "logits/chosen": -2.665362596511841, "logits/rejected": -2.729599952697754, "logps/chosen": -71.48147583007812, "logps/ref_chosen": -37.08332061767578, "logps/ref_rejected": -93.31185150146484, "logps/rejected": -192.48870849609375, "loss": 0.3876, "rewards/accuracies": 0.953125, "rewards/chosen": -1.346604824066162, "rewards/margins": 2.531024217605591, "rewards/rejected": -3.877628803253174, "step": 134 }, { "epoch": 0.19823788546255505, "epsilon_dpo/beta": 0.03875825181603432, "epsilon_dpo/beta_margin_grad_mean": -0.14636199176311493, "epsilon_dpo/beta_margin_grad_std": 0.1549866497516632, "epsilon_dpo/beta_margin_mean": 2.7066292762756348, "epsilon_dpo/beta_margin_std": 2.084244966506958, "epsilon_dpo/loss_margin_mean": 69.87920379638672, "grad_norm": 58.53507995605469, "kl/avg_steps": 0.9375, "kl/beta": 0.03912113606929779, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.862120211153265e-07, "logits/chosen": -2.4796104431152344, "logits/rejected": -2.7320666313171387, "logps/chosen": -68.14396667480469, "logps/ref_chosen": -28.130008697509766, "logps/ref_rejected": -121.04113006591797, "logps/rejected": -230.93429565429688, "loss": 0.3559, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5515646934509277, "rewards/margins": 2.7066292762756348, "rewards/rejected": -4.2581939697265625, "step": 135 }, { "epoch": 0.19970631424375918, "epsilon_dpo/beta": 0.03843460604548454, "epsilon_dpo/beta_margin_grad_mean": -0.1646306961774826, "epsilon_dpo/beta_margin_grad_std": 0.19557887315750122, "epsilon_dpo/beta_margin_mean": 2.8479745388031006, "epsilon_dpo/beta_margin_std": 2.358449935913086, "epsilon_dpo/loss_margin_mean": 74.22594451904297, "grad_norm": 71.29747009277344, "kl/avg_steps": 0.84375, "kl/beta": 0.03875778242945671, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.857886086178193e-07, "logits/chosen": -2.642530918121338, "logits/rejected": -2.726252794265747, "logps/chosen": -80.63860321044922, "logps/ref_chosen": -41.21574401855469, "logps/ref_rejected": -103.0965805053711, "logps/rejected": -216.74537658691406, "loss": 0.4334, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5163445472717285, "rewards/margins": 2.8479745388031006, "rewards/rejected": -4.36431884765625, "step": 136 }, { "epoch": 0.2011747430249633, "epsilon_dpo/beta": 0.03807699307799339, "epsilon_dpo/beta_margin_grad_mean": -0.125637486577034, "epsilon_dpo/beta_margin_grad_std": 0.15655429661273956, "epsilon_dpo/beta_margin_mean": 3.427241325378418, "epsilon_dpo/beta_margin_std": 2.608391523361206, "epsilon_dpo/loss_margin_mean": 90.0654296875, "grad_norm": 50.14603805541992, "kl/avg_steps": 0.9375, "kl/beta": 0.0384334996342659, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.853589828711902e-07, "logits/chosen": -2.384611129760742, "logits/rejected": -2.656006336212158, "logps/chosen": -75.71855163574219, "logps/ref_chosen": -32.656131744384766, "logps/ref_rejected": -119.52920532226562, "logps/rejected": -252.65704345703125, "loss": 0.3075, "rewards/accuracies": 0.96875, "rewards/chosen": -1.6396162509918213, "rewards/margins": 3.427241325378418, "rewards/rejected": -5.06685733795166, "step": 137 }, { "epoch": 0.2026431718061674, "epsilon_dpo/beta": 0.03774713724851608, "epsilon_dpo/beta_margin_grad_mean": -0.13305923342704773, "epsilon_dpo/beta_margin_grad_std": 0.17672809958457947, "epsilon_dpo/beta_margin_mean": 3.189133644104004, "epsilon_dpo/beta_margin_std": 2.1990420818328857, "epsilon_dpo/loss_margin_mean": 84.6056137084961, "grad_norm": 64.70891571044922, "kl/avg_steps": 0.875, "kl/beta": 0.038076531141996384, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.849231551964771e-07, "logits/chosen": -2.3941986560821533, "logits/rejected": -2.439361095428467, "logps/chosen": -81.6050796508789, "logps/ref_chosen": -31.68457794189453, "logps/ref_rejected": -99.58272552490234, "logps/rejected": -234.10885620117188, "loss": 0.3421, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8868746757507324, "rewards/margins": 3.189133644104004, "rewards/rejected": -5.076008319854736, "step": 138 }, { "epoch": 0.20411160058737152, "epsilon_dpo/beta": 0.03741971403360367, "epsilon_dpo/beta_margin_grad_mean": -0.10873990505933762, "epsilon_dpo/beta_margin_grad_std": 0.18296301364898682, "epsilon_dpo/beta_margin_mean": 3.4576268196105957, "epsilon_dpo/beta_margin_std": 2.2132863998413086, "epsilon_dpo/loss_margin_mean": 92.54183959960938, "grad_norm": 64.89179992675781, "kl/avg_steps": 0.875, "kl/beta": 0.03774625062942505, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.844811370781446e-07, "logits/chosen": -2.3702569007873535, "logits/rejected": -2.3330612182617188, "logps/chosen": -74.70332336425781, "logps/ref_chosen": -30.504302978515625, "logps/ref_rejected": -85.29287719726562, "logps/rejected": -222.0337371826172, "loss": 0.3096, "rewards/accuracies": 0.953125, "rewards/chosen": -1.656851053237915, "rewards/margins": 3.4576268196105957, "rewards/rejected": -5.11447811126709, "step": 139 }, { "epoch": 0.2055800293685756, "epsilon_dpo/beta": 0.037118520587682724, "epsilon_dpo/beta_margin_grad_mean": -0.14209268987178802, "epsilon_dpo/beta_margin_grad_std": 0.18320953845977783, "epsilon_dpo/beta_margin_mean": 3.001073122024536, "epsilon_dpo/beta_margin_std": 2.1810362339019775, "epsilon_dpo/loss_margin_mean": 81.0239486694336, "grad_norm": 64.97393035888672, "kl/avg_steps": 0.8125, "kl/beta": 0.03741883859038353, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.840329401637809e-07, "logits/chosen": -2.463718891143799, "logits/rejected": -2.454908609390259, "logps/chosen": -87.36968994140625, "logps/ref_chosen": -36.32595443725586, "logps/ref_rejected": -89.32574462890625, "logps/rejected": -221.3934326171875, "loss": 0.3733, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8963526487350464, "rewards/margins": 3.001072883605957, "rewards/rejected": -4.897425651550293, "step": 140 }, { "epoch": 0.20704845814977973, "epsilon_dpo/beta": 0.03681936860084534, "epsilon_dpo/beta_margin_grad_mean": -0.12768100202083588, "epsilon_dpo/beta_margin_grad_std": 0.1734439581632614, "epsilon_dpo/beta_margin_mean": 3.1639914512634277, "epsilon_dpo/beta_margin_std": 2.328209638595581, "epsilon_dpo/loss_margin_mean": 86.11234283447266, "grad_norm": 73.34292602539062, "kl/avg_steps": 0.8125, "kl/beta": 0.037117261439561844, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.83578576263792e-07, "logits/chosen": -2.636939287185669, "logits/rejected": -2.580254316329956, "logps/chosen": -108.33308410644531, "logps/ref_chosen": -54.31216049194336, "logps/ref_rejected": -105.19206237792969, "logps/rejected": -245.32533264160156, "loss": 0.3351, "rewards/accuracies": 0.9375, "rewards/chosen": -1.991283655166626, "rewards/margins": 3.1639914512634277, "rewards/rejected": -5.155275344848633, "step": 141 }, { "epoch": 0.20851688693098386, "epsilon_dpo/beta": 0.03649960458278656, "epsilon_dpo/beta_margin_grad_mean": -0.14174862205982208, "epsilon_dpo/beta_margin_grad_std": 0.17707964777946472, "epsilon_dpo/beta_margin_mean": 3.183145761489868, "epsilon_dpo/beta_margin_std": 2.435292959213257, "epsilon_dpo/loss_margin_mean": 87.32428741455078, "grad_norm": 72.56526947021484, "kl/avg_steps": 0.875, "kl/beta": 0.036818113178014755, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.83118057351089e-07, "logits/chosen": -2.3550641536712646, "logits/rejected": -2.433764934539795, "logps/chosen": -91.45923614501953, "logps/ref_chosen": -37.70738220214844, "logps/ref_rejected": -100.41778564453125, "logps/rejected": -241.49392700195312, "loss": 0.3654, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9631787538528442, "rewards/margins": 3.183145523071289, "rewards/rejected": -5.146324157714844, "step": 142 }, { "epoch": 0.20998531571218795, "epsilon_dpo/beta": 0.03624003753066063, "epsilon_dpo/beta_margin_grad_mean": -0.1898718923330307, "epsilon_dpo/beta_margin_grad_std": 0.2213219702243805, "epsilon_dpo/beta_margin_mean": 2.72891902923584, "epsilon_dpo/beta_margin_std": 2.580869674682617, "epsilon_dpo/loss_margin_mean": 75.5594482421875, "grad_norm": 89.1434555053711, "kl/avg_steps": 0.71875, "kl/beta": 0.036498747766017914, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.826513955607734e-07, "logits/chosen": -2.482374668121338, "logits/rejected": -2.3714542388916016, "logps/chosen": -89.65013122558594, "logps/ref_chosen": -40.36812210083008, "logps/ref_rejected": -88.28787994384766, "logps/rejected": -213.12933349609375, "loss": 0.5517, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7896441221237183, "rewards/margins": 2.72891902923584, "rewards/rejected": -4.518563270568848, "step": 143 }, { "epoch": 0.21145374449339208, "epsilon_dpo/beta": 0.03593612089753151, "epsilon_dpo/beta_margin_grad_mean": -0.14966890215873718, "epsilon_dpo/beta_margin_grad_std": 0.1911356896162033, "epsilon_dpo/beta_margin_mean": 2.930758237838745, "epsilon_dpo/beta_margin_std": 2.2233402729034424, "epsilon_dpo/loss_margin_mean": 81.71424102783203, "grad_norm": 74.61936950683594, "kl/avg_steps": 0.84375, "kl/beta": 0.03623828664422035, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.821786031898176e-07, "logits/chosen": -2.5220513343811035, "logits/rejected": -2.4428181648254395, "logps/chosen": -90.16825103759766, "logps/ref_chosen": -41.09404373168945, "logps/ref_rejected": -87.43101501464844, "logps/rejected": -218.21946716308594, "loss": 0.4112, "rewards/accuracies": 0.9375, "rewards/chosen": -1.765446662902832, "rewards/margins": 2.930758476257324, "rewards/rejected": -4.696205139160156, "step": 144 }, { "epoch": 0.21292217327459617, "epsilon_dpo/beta": 0.035612985491752625, "epsilon_dpo/beta_margin_grad_mean": -0.11719842255115509, "epsilon_dpo/beta_margin_grad_std": 0.13962174952030182, "epsilon_dpo/beta_margin_mean": 3.221985101699829, "epsilon_dpo/beta_margin_std": 2.2744228839874268, "epsilon_dpo/loss_margin_mean": 90.55402374267578, "grad_norm": 62.392303466796875, "kl/avg_steps": 0.90625, "kl/beta": 0.03593508526682854, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.816996926967401e-07, "logits/chosen": -2.4150798320770264, "logits/rejected": -2.3624231815338135, "logps/chosen": -86.23258972167969, "logps/ref_chosen": -35.7225341796875, "logps/ref_rejected": -84.17701721191406, "logps/rejected": -225.2410888671875, "loss": 0.2806, "rewards/accuracies": 0.96875, "rewards/chosen": -1.800328016281128, "rewards/margins": 3.221985340118408, "rewards/rejected": -5.022313117980957, "step": 145 }, { "epoch": 0.2143906020558003, "epsilon_dpo/beta": 0.03531539812684059, "epsilon_dpo/beta_margin_grad_mean": -0.1365576684474945, "epsilon_dpo/beta_margin_grad_std": 0.18428391218185425, "epsilon_dpo/beta_margin_mean": 3.0302278995513916, "epsilon_dpo/beta_margin_std": 2.1277804374694824, "epsilon_dpo/loss_margin_mean": 85.95875549316406, "grad_norm": 61.81754684448242, "kl/avg_steps": 0.84375, "kl/beta": 0.03561234846711159, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.812146767012779e-07, "logits/chosen": -2.4440078735351562, "logits/rejected": -2.263402223587036, "logps/chosen": -100.90562438964844, "logps/ref_chosen": -41.776546478271484, "logps/ref_rejected": -87.99478149414062, "logps/rejected": -233.08261108398438, "loss": 0.3739, "rewards/accuracies": 0.953125, "rewards/chosen": -2.0904293060302734, "rewards/margins": 3.0302281379699707, "rewards/rejected": -5.120657444000244, "step": 146 }, { "epoch": 0.21585903083700442, "epsilon_dpo/beta": 0.03503095358610153, "epsilon_dpo/beta_margin_grad_mean": -0.14339102804660797, "epsilon_dpo/beta_margin_grad_std": 0.21028229594230652, "epsilon_dpo/beta_margin_mean": 3.1163737773895264, "epsilon_dpo/beta_margin_std": 2.363018274307251, "epsilon_dpo/loss_margin_mean": 89.17818450927734, "grad_norm": 94.37625122070312, "kl/avg_steps": 0.8125, "kl/beta": 0.03531438112258911, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.807235679840536e-07, "logits/chosen": -2.4340085983276367, "logits/rejected": -2.291194200515747, "logps/chosen": -89.63932800292969, "logps/ref_chosen": -35.02455139160156, "logps/ref_rejected": -76.69776153564453, "logps/rejected": -220.49072265625, "loss": 0.4282, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9178650379180908, "rewards/margins": 3.1163740158081055, "rewards/rejected": -5.034238815307617, "step": 147 }, { "epoch": 0.2173274596182085, "epsilon_dpo/beta": 0.03473767638206482, "epsilon_dpo/beta_margin_grad_mean": -0.18217557668685913, "epsilon_dpo/beta_margin_grad_std": 0.1949581652879715, "epsilon_dpo/beta_margin_mean": 2.4017815589904785, "epsilon_dpo/beta_margin_std": 2.13431715965271, "epsilon_dpo/loss_margin_mean": 69.28678894042969, "grad_norm": 83.76101684570312, "kl/avg_steps": 0.84375, "kl/beta": 0.03502976521849632, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.802263794862384e-07, "logits/chosen": -2.529165744781494, "logits/rejected": -2.569955825805664, "logps/chosen": -100.62034606933594, "logps/ref_chosen": -43.158294677734375, "logps/ref_rejected": -109.0255126953125, "logps/rejected": -235.7743682861328, "loss": 0.4897, "rewards/accuracies": 0.90625, "rewards/chosen": -1.9978649616241455, "rewards/margins": 2.4017815589904785, "rewards/rejected": -4.399646759033203, "step": 148 }, { "epoch": 0.21879588839941264, "epsilon_dpo/beta": 0.0344361737370491, "epsilon_dpo/beta_margin_grad_mean": -0.15145255625247955, "epsilon_dpo/beta_margin_grad_std": 0.1748467981815338, "epsilon_dpo/beta_margin_mean": 2.92539644241333, "epsilon_dpo/beta_margin_std": 2.3435420989990234, "epsilon_dpo/loss_margin_mean": 85.06315612792969, "grad_norm": 69.50299835205078, "kl/avg_steps": 0.875, "kl/beta": 0.03473667427897453, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.797231243092118e-07, "logits/chosen": -2.407914638519287, "logits/rejected": -2.4749755859375, "logps/chosen": -93.2674331665039, "logps/ref_chosen": -34.801856994628906, "logps/ref_rejected": -101.17015075683594, "logps/rejected": -244.69888305664062, "loss": 0.383, "rewards/accuracies": 0.953125, "rewards/chosen": -2.014852285385132, "rewards/margins": 2.92539644241333, "rewards/rejected": -4.940248489379883, "step": 149 }, { "epoch": 0.22026431718061673, "epsilon_dpo/beta": 0.034148234874010086, "epsilon_dpo/beta_margin_grad_mean": -0.13967867195606232, "epsilon_dpo/beta_margin_grad_std": 0.1724717617034912, "epsilon_dpo/beta_margin_mean": 2.8002302646636963, "epsilon_dpo/beta_margin_std": 1.951874852180481, "epsilon_dpo/loss_margin_mean": 82.1502456665039, "grad_norm": 67.23019409179688, "kl/avg_steps": 0.84375, "kl/beta": 0.03443536534905434, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.792138157142157e-07, "logits/chosen": -2.3103485107421875, "logits/rejected": -2.399022102355957, "logps/chosen": -79.86786651611328, "logps/ref_chosen": -27.168495178222656, "logps/ref_rejected": -88.6546859741211, "logps/rejected": -223.50430297851562, "loss": 0.3576, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8020522594451904, "rewards/margins": 2.800230026245117, "rewards/rejected": -4.602282524108887, "step": 150 }, { "epoch": 0.22173274596182085, "epsilon_dpo/beta": 0.033841170370578766, "epsilon_dpo/beta_margin_grad_mean": -0.12026865780353546, "epsilon_dpo/beta_margin_grad_std": 0.14584773778915405, "epsilon_dpo/beta_margin_mean": 2.9734950065612793, "epsilon_dpo/beta_margin_std": 1.9034627676010132, "epsilon_dpo/loss_margin_mean": 87.95604705810547, "grad_norm": 56.315635681152344, "kl/avg_steps": 0.90625, "kl/beta": 0.034147247672080994, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.786984671220053e-07, "logits/chosen": -2.5218310356140137, "logits/rejected": -2.627288341522217, "logps/chosen": -98.35040283203125, "logps/ref_chosen": -44.517967224121094, "logps/ref_rejected": -107.56175994873047, "logps/rejected": -249.35023498535156, "loss": 0.2918, "rewards/accuracies": 0.953125, "rewards/chosen": -1.8228228092193604, "rewards/margins": 2.9734950065612793, "rewards/rejected": -4.796318054199219, "step": 151 }, { "epoch": 0.22320117474302498, "epsilon_dpo/beta": 0.033547814935445786, "epsilon_dpo/beta_margin_grad_mean": -0.12259558588266373, "epsilon_dpo/beta_margin_grad_std": 0.1732029914855957, "epsilon_dpo/beta_margin_mean": 3.1676061153411865, "epsilon_dpo/beta_margin_std": 2.1665868759155273, "epsilon_dpo/loss_margin_mean": 94.56542205810547, "grad_norm": 68.73706817626953, "kl/avg_steps": 0.875, "kl/beta": 0.033840566873550415, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.78177092112495e-07, "logits/chosen": -2.4150760173797607, "logits/rejected": -2.4403088092803955, "logps/chosen": -102.739501953125, "logps/ref_chosen": -45.36680603027344, "logps/ref_rejected": -112.48715209960938, "logps/rejected": -264.4252624511719, "loss": 0.3243, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9261062145233154, "rewards/margins": 3.1676058769226074, "rewards/rejected": -5.093712329864502, "step": 152 }, { "epoch": 0.22466960352422907, "epsilon_dpo/beta": 0.033309243619441986, "epsilon_dpo/beta_margin_grad_mean": -0.17181988060474396, "epsilon_dpo/beta_margin_grad_std": 0.2117866724729538, "epsilon_dpo/beta_margin_mean": 2.878479480743408, "epsilon_dpo/beta_margin_std": 2.4571328163146973, "epsilon_dpo/loss_margin_mean": 86.69181060791016, "grad_norm": 84.02366638183594, "kl/avg_steps": 0.71875, "kl/beta": 0.03354702889919281, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.776497044244016e-07, "logits/chosen": -2.4134445190429688, "logits/rejected": -2.408932685852051, "logps/chosen": -99.41288757324219, "logps/ref_chosen": -39.70823669433594, "logps/ref_rejected": -91.66893768310547, "logps/rejected": -238.06539916992188, "loss": 0.468, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9907963275909424, "rewards/margins": 2.878479480743408, "rewards/rejected": -4.86927604675293, "step": 153 }, { "epoch": 0.2261380323054332, "epsilon_dpo/beta": 0.033061131834983826, "epsilon_dpo/beta_margin_grad_mean": -0.15857023000717163, "epsilon_dpo/beta_margin_grad_std": 0.21813619136810303, "epsilon_dpo/beta_margin_mean": 3.233847141265869, "epsilon_dpo/beta_margin_std": 2.5847911834716797, "epsilon_dpo/loss_margin_mean": 98.10140991210938, "grad_norm": 101.56253051757812, "kl/avg_steps": 0.75, "kl/beta": 0.03330763056874275, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.771163179548808e-07, "logits/chosen": -2.3385679721832275, "logits/rejected": -2.41574764251709, "logps/chosen": -91.09129333496094, "logps/ref_chosen": -35.708839416503906, "logps/ref_rejected": -111.15846252441406, "logps/rejected": -264.642333984375, "loss": 0.4488, "rewards/accuracies": 0.859375, "rewards/chosen": -1.834795594215393, "rewards/margins": 3.233847141265869, "rewards/rejected": -5.068642616271973, "step": 154 }, { "epoch": 0.2276064610866373, "epsilon_dpo/beta": 0.03278402239084244, "epsilon_dpo/beta_margin_grad_mean": -0.1480940580368042, "epsilon_dpo/beta_margin_grad_std": 0.18447977304458618, "epsilon_dpo/beta_margin_mean": 2.7261838912963867, "epsilon_dpo/beta_margin_std": 1.8952271938323975, "epsilon_dpo/loss_margin_mean": 83.30589294433594, "grad_norm": 63.52682876586914, "kl/avg_steps": 0.84375, "kl/beta": 0.033059682697057724, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -2.438190460205078, "logits/rejected": -2.357145071029663, "logps/chosen": -102.16056823730469, "logps/ref_chosen": -42.04505157470703, "logps/ref_rejected": -84.00552368164062, "logps/rejected": -227.42694091796875, "loss": 0.384, "rewards/accuracies": 0.90625, "rewards/chosen": -1.9715728759765625, "rewards/margins": 2.7261838912963867, "rewards/rejected": -4.697756767272949, "step": 155 }, { "epoch": 0.2290748898678414, "epsilon_dpo/beta": 0.03256094828248024, "epsilon_dpo/beta_margin_grad_mean": -0.16920383274555206, "epsilon_dpo/beta_margin_grad_std": 0.22653749585151672, "epsilon_dpo/beta_margin_mean": 2.8284294605255127, "epsilon_dpo/beta_margin_std": 2.360419750213623, "epsilon_dpo/loss_margin_mean": 87.19612121582031, "grad_norm": 146.7109375, "kl/avg_steps": 0.6875, "kl/beta": 0.03278307616710663, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -2.4288077354431152, "logits/rejected": -2.2443032264709473, "logps/chosen": -115.40594482421875, "logps/ref_chosen": -49.038185119628906, "logps/ref_rejected": -84.26383209228516, "logps/rejected": -237.8277130126953, "loss": 0.5071, "rewards/accuracies": 0.890625, "rewards/chosen": -2.1660971641540527, "rewards/margins": 2.828429698944092, "rewards/rejected": -4.9945268630981445, "step": 156 }, { "epoch": 0.2305433186490455, "epsilon_dpo/beta": 0.03226739168167114, "epsilon_dpo/beta_margin_grad_mean": -0.11788501590490341, "epsilon_dpo/beta_margin_grad_std": 0.1734883189201355, "epsilon_dpo/beta_margin_mean": 3.567206382751465, "epsilon_dpo/beta_margin_std": 2.317911386489868, "epsilon_dpo/loss_margin_mean": 110.67003631591797, "grad_norm": 74.27545928955078, "kl/avg_steps": 0.90625, "kl/beta": 0.03255923092365265, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -2.416853904724121, "logits/rejected": -2.5221433639526367, "logps/chosen": -85.81341552734375, "logps/ref_chosen": -30.534713745117188, "logps/ref_rejected": -114.2703628540039, "logps/rejected": -280.2191162109375, "loss": 0.3039, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7866594791412354, "rewards/margins": 3.567206382751465, "rewards/rejected": -5.353865623474121, "step": 157 }, { "epoch": 0.23201174743024963, "epsilon_dpo/beta": 0.03198767825961113, "epsilon_dpo/beta_margin_grad_mean": -0.11316555738449097, "epsilon_dpo/beta_margin_grad_std": 0.1629510372877121, "epsilon_dpo/beta_margin_mean": 3.3184967041015625, "epsilon_dpo/beta_margin_std": 2.208585739135742, "epsilon_dpo/loss_margin_mean": 103.88768768310547, "grad_norm": 52.514793395996094, "kl/avg_steps": 0.875, "kl/beta": 0.03226681426167488, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -2.434067726135254, "logits/rejected": -2.5916080474853516, "logps/chosen": -77.87710571289062, "logps/ref_chosen": -30.166948318481445, "logps/ref_rejected": -119.61189270019531, "logps/rejected": -271.2097473144531, "loss": 0.2905, "rewards/accuracies": 0.953125, "rewards/chosen": -1.5289018154144287, "rewards/margins": 3.3184967041015625, "rewards/rejected": -4.84739875793457, "step": 158 }, { "epoch": 0.23348017621145375, "epsilon_dpo/beta": 0.03173020854592323, "epsilon_dpo/beta_margin_grad_mean": -0.1506548821926117, "epsilon_dpo/beta_margin_grad_std": 0.20841404795646667, "epsilon_dpo/beta_margin_mean": 3.110417604446411, "epsilon_dpo/beta_margin_std": 2.455885410308838, "epsilon_dpo/loss_margin_mean": 98.24658203125, "grad_norm": 72.99739074707031, "kl/avg_steps": 0.8125, "kl/beta": 0.03198692947626114, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.743599013306165e-07, "logits/chosen": -2.645659923553467, "logits/rejected": -2.5504307746887207, "logps/chosen": -96.61356353759766, "logps/ref_chosen": -43.95257568359375, "logps/ref_rejected": -96.23291015625, "logps/rejected": -247.14047241210938, "loss": 0.4372, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6737987995147705, "rewards/margins": 3.110417366027832, "rewards/rejected": -4.784216403961182, "step": 159 }, { "epoch": 0.23494860499265785, "epsilon_dpo/beta": 0.03151414170861244, "epsilon_dpo/beta_margin_grad_mean": -0.17995795607566833, "epsilon_dpo/beta_margin_grad_std": 0.22594332695007324, "epsilon_dpo/beta_margin_mean": 2.8287112712860107, "epsilon_dpo/beta_margin_std": 2.4387524127960205, "epsilon_dpo/loss_margin_mean": 90.09088134765625, "grad_norm": 85.227294921875, "kl/avg_steps": 0.6875, "kl/beta": 0.03172912821173668, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.737908228387656e-07, "logits/chosen": -2.586042881011963, "logits/rejected": -2.5926198959350586, "logps/chosen": -97.44248962402344, "logps/ref_chosen": -43.95906066894531, "logps/ref_rejected": -104.28060913085938, "logps/rejected": -247.85491943359375, "loss": 0.5131, "rewards/accuracies": 0.875, "rewards/chosen": -1.6889050006866455, "rewards/margins": 2.82871150970459, "rewards/rejected": -4.517616271972656, "step": 160 }, { "epoch": 0.23641703377386197, "epsilon_dpo/beta": 0.031249718740582466, "epsilon_dpo/beta_margin_grad_mean": -0.14812429249286652, "epsilon_dpo/beta_margin_grad_std": 0.21790093183517456, "epsilon_dpo/beta_margin_mean": 2.9674715995788574, "epsilon_dpo/beta_margin_std": 2.1990880966186523, "epsilon_dpo/loss_margin_mean": 95.17440795898438, "grad_norm": 69.41145324707031, "kl/avg_steps": 0.84375, "kl/beta": 0.03151248022913933, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -2.514247417449951, "logits/rejected": -2.5099971294403076, "logps/chosen": -81.56330108642578, "logps/ref_chosen": -30.6236572265625, "logps/ref_rejected": -88.5151138305664, "logps/rejected": -234.62916564941406, "loss": 0.4746, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5938321352005005, "rewards/margins": 2.9674715995788574, "rewards/rejected": -4.561304092407227, "step": 161 }, { "epoch": 0.23788546255506607, "epsilon_dpo/beta": 0.030998021364212036, "epsilon_dpo/beta_margin_grad_mean": -0.14223085343837738, "epsilon_dpo/beta_margin_grad_std": 0.18433333933353424, "epsilon_dpo/beta_margin_mean": 2.894646406173706, "epsilon_dpo/beta_margin_std": 2.0557403564453125, "epsilon_dpo/loss_margin_mean": 93.58312225341797, "grad_norm": 68.47539520263672, "kl/avg_steps": 0.8125, "kl/beta": 0.031248819082975388, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -2.5756616592407227, "logits/rejected": -2.518211603164673, "logps/chosen": -85.86666870117188, "logps/ref_chosen": -37.11417007446289, "logps/ref_rejected": -94.5440673828125, "logps/rejected": -236.87969970703125, "loss": 0.3757, "rewards/accuracies": 0.921875, "rewards/chosen": -1.514453411102295, "rewards/margins": 2.894646167755127, "rewards/rejected": -4.409099578857422, "step": 162 }, { "epoch": 0.2393538913362702, "epsilon_dpo/beta": 0.030757879838347435, "epsilon_dpo/beta_margin_grad_mean": -0.15289483964443207, "epsilon_dpo/beta_margin_grad_std": 0.209597647190094, "epsilon_dpo/beta_margin_mean": 2.800816535949707, "epsilon_dpo/beta_margin_std": 1.9919666051864624, "epsilon_dpo/loss_margin_mean": 91.29692077636719, "grad_norm": 56.615814208984375, "kl/avg_steps": 0.78125, "kl/beta": 0.03099696896970272, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.720482655449212e-07, "logits/chosen": -2.5021204948425293, "logits/rejected": -2.4259862899780273, "logps/chosen": -75.5699234008789, "logps/ref_chosen": -30.90496063232422, "logps/ref_rejected": -81.61336517333984, "logps/rejected": -217.57525634765625, "loss": 0.4252, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3745607137680054, "rewards/margins": 2.800816535949707, "rewards/rejected": -4.175376892089844, "step": 163 }, { "epoch": 0.24082232011747431, "epsilon_dpo/beta": 0.03046177327632904, "epsilon_dpo/beta_margin_grad_mean": -0.12999504804611206, "epsilon_dpo/beta_margin_grad_std": 0.14756108820438385, "epsilon_dpo/beta_margin_mean": 2.6089589595794678, "epsilon_dpo/beta_margin_std": 1.5135873556137085, "epsilon_dpo/loss_margin_mean": 85.68292999267578, "grad_norm": 59.15823745727539, "kl/avg_steps": 0.96875, "kl/beta": 0.03075668215751648, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 4.714556901942599e-07, "logits/chosen": -2.5339465141296387, "logits/rejected": -2.4976041316986084, "logps/chosen": -84.63601684570312, "logps/ref_chosen": -32.731651306152344, "logps/ref_rejected": -87.29302215576172, "logps/rejected": -224.88031005859375, "loss": 0.3178, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5812957286834717, "rewards/margins": 2.6089587211608887, "rewards/rejected": -4.190255165100098, "step": 164 }, { "epoch": 0.2422907488986784, "epsilon_dpo/beta": 0.03020758554339409, "epsilon_dpo/beta_margin_grad_mean": -0.17471088469028473, "epsilon_dpo/beta_margin_grad_std": 0.19172601401805878, "epsilon_dpo/beta_margin_mean": 2.3461360931396484, "epsilon_dpo/beta_margin_std": 1.85222589969635, "epsilon_dpo/loss_margin_mean": 77.82381439208984, "grad_norm": 75.63594055175781, "kl/avg_steps": 0.84375, "kl/beta": 0.030461585149168968, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.708572792802069e-07, "logits/chosen": -2.676835536956787, "logits/rejected": -2.463170051574707, "logps/chosen": -100.64947509765625, "logps/ref_chosen": -42.845787048339844, "logps/ref_rejected": -77.92173767089844, "logps/rejected": -213.5492401123047, "loss": 0.4735, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7484538555145264, "rewards/margins": 2.3461360931396484, "rewards/rejected": -4.094590187072754, "step": 165 }, { "epoch": 0.24375917767988253, "epsilon_dpo/beta": 0.03000204637646675, "epsilon_dpo/beta_margin_grad_mean": -0.1740747094154358, "epsilon_dpo/beta_margin_grad_std": 0.21660596132278442, "epsilon_dpo/beta_margin_mean": 2.6607251167297363, "epsilon_dpo/beta_margin_std": 2.3676037788391113, "epsilon_dpo/loss_margin_mean": 89.0173110961914, "grad_norm": 83.7566909790039, "kl/avg_steps": 0.6875, "kl/beta": 0.030206717550754547, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.702530485714461e-07, "logits/chosen": -2.4904098510742188, "logits/rejected": -2.599632740020752, "logps/chosen": -106.1424560546875, "logps/ref_chosen": -41.773406982421875, "logps/ref_rejected": -105.80018615722656, "logps/rejected": -259.1865234375, "loss": 0.4996, "rewards/accuracies": 0.859375, "rewards/chosen": -1.9366867542266846, "rewards/margins": 2.6607251167297363, "rewards/rejected": -4.597411632537842, "step": 166 }, { "epoch": 0.24522760646108663, "epsilon_dpo/beta": 0.02974093332886696, "epsilon_dpo/beta_margin_grad_mean": -0.12238018214702606, "epsilon_dpo/beta_margin_grad_std": 0.16405010223388672, "epsilon_dpo/beta_margin_mean": 2.9359490871429443, "epsilon_dpo/beta_margin_std": 1.7715035676956177, "epsilon_dpo/loss_margin_mean": 98.85689544677734, "grad_norm": 57.45083999633789, "kl/avg_steps": 0.875, "kl/beta": 0.030000463128089905, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -2.4179039001464844, "logits/rejected": -2.5366764068603516, "logps/chosen": -93.92123413085938, "logps/ref_chosen": -32.289695739746094, "logps/ref_rejected": -99.84996795654297, "logps/rejected": -260.33837890625, "loss": 0.3122, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8347046375274658, "rewards/margins": 2.9359493255615234, "rewards/rejected": -4.77065372467041, "step": 167 }, { "epoch": 0.24669603524229075, "epsilon_dpo/beta": 0.02948295697569847, "epsilon_dpo/beta_margin_grad_mean": -0.15910814702510834, "epsilon_dpo/beta_margin_grad_std": 0.168760284781456, "epsilon_dpo/beta_margin_mean": 2.5459718704223633, "epsilon_dpo/beta_margin_std": 1.8775861263275146, "epsilon_dpo/loss_margin_mean": 86.47795104980469, "grad_norm": 70.27559661865234, "kl/avg_steps": 0.875, "kl/beta": 0.029740236699581146, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.690271916109034e-07, "logits/chosen": -2.536531448364258, "logits/rejected": -2.3941898345947266, "logps/chosen": -95.75901794433594, "logps/ref_chosen": -35.88120651245117, "logps/ref_rejected": -80.32093811035156, "logps/rejected": -226.67669677734375, "loss": 0.4007, "rewards/accuracies": 0.9375, "rewards/chosen": -1.767836093902588, "rewards/margins": 2.5459718704223633, "rewards/rejected": -4.313807964324951, "step": 168 }, { "epoch": 0.24816446402349487, "epsilon_dpo/beta": 0.029254861176013947, "epsilon_dpo/beta_margin_grad_mean": -0.19129043817520142, "epsilon_dpo/beta_margin_grad_std": 0.22424982488155365, "epsilon_dpo/beta_margin_mean": 2.5321929454803467, "epsilon_dpo/beta_margin_std": 2.3219850063323975, "epsilon_dpo/loss_margin_mean": 86.81468963623047, "grad_norm": 84.80979919433594, "kl/avg_steps": 0.78125, "kl/beta": 0.029482265934348106, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -2.558346748352051, "logits/rejected": -2.5368800163269043, "logps/chosen": -96.68545532226562, "logps/ref_chosen": -35.75175476074219, "logps/ref_rejected": -88.39851379394531, "logps/rejected": -236.14691162109375, "loss": 0.5701, "rewards/accuracies": 0.890625, "rewards/chosen": -1.786689281463623, "rewards/margins": 2.532193183898926, "rewards/rejected": -4.318882465362549, "step": 169 }, { "epoch": 0.24963289280469897, "epsilon_dpo/beta": 0.028982365503907204, "epsilon_dpo/beta_margin_grad_mean": -0.11600478738546371, "epsilon_dpo/beta_margin_grad_std": 0.14576373994350433, "epsilon_dpo/beta_margin_mean": 2.932541847229004, "epsilon_dpo/beta_margin_std": 1.7402071952819824, "epsilon_dpo/loss_margin_mean": 101.24751281738281, "grad_norm": 43.760440826416016, "kl/avg_steps": 0.9375, "kl/beta": 0.029253721237182617, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -2.3630564212799072, "logits/rejected": -2.373150587081909, "logps/chosen": -83.35134887695312, "logps/ref_chosen": -31.416187286376953, "logps/ref_rejected": -82.51651000976562, "logps/rejected": -235.6991729736328, "loss": 0.2813, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5055475234985352, "rewards/margins": 2.932541847229004, "rewards/rejected": -4.438089370727539, "step": 170 }, { "epoch": 0.2511013215859031, "epsilon_dpo/beta": 0.028740353882312775, "epsilon_dpo/beta_margin_grad_mean": -0.13344722986221313, "epsilon_dpo/beta_margin_grad_std": 0.16077838838100433, "epsilon_dpo/beta_margin_mean": 2.9016919136047363, "epsilon_dpo/beta_margin_std": 2.042875051498413, "epsilon_dpo/loss_margin_mean": 101.12266540527344, "grad_norm": 72.2908706665039, "kl/avg_steps": 0.84375, "kl/beta": 0.028982015326619148, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -2.617936611175537, "logits/rejected": -2.546191692352295, "logps/chosen": -120.83259582519531, "logps/ref_chosen": -55.1216926574707, "logps/ref_rejected": -117.03608703613281, "logps/rejected": -283.8696594238281, "loss": 0.3355, "rewards/accuracies": 0.953125, "rewards/chosen": -1.8903237581253052, "rewards/margins": 2.9016919136047363, "rewards/rejected": -4.792015552520752, "step": 171 }, { "epoch": 0.2525697503671072, "epsilon_dpo/beta": 0.028499886393547058, "epsilon_dpo/beta_margin_grad_mean": -0.12330116331577301, "epsilon_dpo/beta_margin_grad_std": 0.18182381987571716, "epsilon_dpo/beta_margin_mean": 2.9543943405151367, "epsilon_dpo/beta_margin_std": 1.9407035112380981, "epsilon_dpo/loss_margin_mean": 103.86387634277344, "grad_norm": 62.24739456176758, "kl/avg_steps": 0.84375, "kl/beta": 0.028739525005221367, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -2.4678688049316406, "logits/rejected": -2.340376853942871, "logps/chosen": -103.77349090576172, "logps/ref_chosen": -42.184104919433594, "logps/ref_rejected": -90.7464599609375, "logps/rejected": -256.19970703125, "loss": 0.3457, "rewards/accuracies": 0.921875, "rewards/chosen": -1.7575478553771973, "rewards/margins": 2.9543943405151367, "rewards/rejected": -4.711941719055176, "step": 172 }, { "epoch": 0.2540381791483113, "epsilon_dpo/beta": 0.028279244899749756, "epsilon_dpo/beta_margin_grad_mean": -0.1761482208967209, "epsilon_dpo/beta_margin_grad_std": 0.1999354213476181, "epsilon_dpo/beta_margin_mean": 2.418248414993286, "epsilon_dpo/beta_margin_std": 1.9150099754333496, "epsilon_dpo/loss_margin_mean": 85.72386932373047, "grad_norm": 93.45008850097656, "kl/avg_steps": 0.78125, "kl/beta": 0.028499064967036247, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -2.362546443939209, "logits/rejected": -2.276439666748047, "logps/chosen": -126.22164154052734, "logps/ref_chosen": -46.40528869628906, "logps/ref_rejected": -84.17361450195312, "logps/rejected": -249.71383666992188, "loss": 0.4831, "rewards/accuracies": 0.890625, "rewards/chosen": -2.261880874633789, "rewards/margins": 2.418248414993286, "rewards/rejected": -4.680129051208496, "step": 173 }, { "epoch": 0.2555066079295154, "epsilon_dpo/beta": 0.028042349964380264, "epsilon_dpo/beta_margin_grad_mean": -0.15671639144420624, "epsilon_dpo/beta_margin_grad_std": 0.1755547821521759, "epsilon_dpo/beta_margin_mean": 2.786201238632202, "epsilon_dpo/beta_margin_std": 2.2568979263305664, "epsilon_dpo/loss_margin_mean": 99.53160858154297, "grad_norm": 68.29830932617188, "kl/avg_steps": 0.84375, "kl/beta": 0.02827814221382141, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.652116329460919e-07, "logits/chosen": -2.276648759841919, "logits/rejected": -2.400620460510254, "logps/chosen": -106.728515625, "logps/ref_chosen": -33.12921142578125, "logps/ref_rejected": -107.51828002929688, "logps/rejected": -280.6492004394531, "loss": 0.4027, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0658488273620605, "rewards/margins": 2.786201238632202, "rewards/rejected": -4.852049827575684, "step": 174 }, { "epoch": 0.25697503671071953, "epsilon_dpo/beta": 0.027807721868157387, "epsilon_dpo/beta_margin_grad_mean": -0.12103898078203201, "epsilon_dpo/beta_margin_grad_std": 0.18520337343215942, "epsilon_dpo/beta_margin_mean": 3.309805393218994, "epsilon_dpo/beta_margin_std": 2.263869524002075, "epsilon_dpo/loss_margin_mean": 119.24435424804688, "grad_norm": 80.84042358398438, "kl/avg_steps": 0.84375, "kl/beta": 0.028041541576385498, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.645557588393406e-07, "logits/chosen": -2.220841884613037, "logits/rejected": -2.1725313663482666, "logps/chosen": -105.72348022460938, "logps/ref_chosen": -31.185977935791016, "logps/ref_rejected": -96.78857421875, "logps/rejected": -290.5704345703125, "loss": 0.3301, "rewards/accuracies": 0.921875, "rewards/chosen": -2.0765433311462402, "rewards/margins": 3.309805393218994, "rewards/rejected": -5.386348724365234, "step": 175 }, { "epoch": 0.25844346549192365, "epsilon_dpo/beta": 0.02758374810218811, "epsilon_dpo/beta_margin_grad_mean": -0.1377795934677124, "epsilon_dpo/beta_margin_grad_std": 0.17307686805725098, "epsilon_dpo/beta_margin_mean": 2.979292154312134, "epsilon_dpo/beta_margin_std": 2.1362128257751465, "epsilon_dpo/loss_margin_mean": 108.23030853271484, "grad_norm": 60.13013458251953, "kl/avg_steps": 0.8125, "kl/beta": 0.027806920930743217, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.638942309888058e-07, "logits/chosen": -2.2297158241271973, "logits/rejected": -2.2983927726745605, "logps/chosen": -116.0774917602539, "logps/ref_chosen": -34.4107666015625, "logps/ref_rejected": -103.36322021484375, "logps/rejected": -293.26025390625, "loss": 0.3514, "rewards/accuracies": 0.921875, "rewards/chosen": -2.254232168197632, "rewards/margins": 2.979292392730713, "rewards/rejected": -5.233524322509766, "step": 176 }, { "epoch": 0.2599118942731278, "epsilon_dpo/beta": 0.027404537424445152, "epsilon_dpo/beta_margin_grad_mean": -0.17347557842731476, "epsilon_dpo/beta_margin_grad_std": 0.22418558597564697, "epsilon_dpo/beta_margin_mean": 2.8900716304779053, "epsilon_dpo/beta_margin_std": 2.369600296020508, "epsilon_dpo/loss_margin_mean": 105.86678314208984, "grad_norm": 88.59068298339844, "kl/avg_steps": 0.65625, "kl/beta": 0.027582809329032898, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -2.3562376499176025, "logits/rejected": -2.3722541332244873, "logps/chosen": -122.57586669921875, "logps/ref_chosen": -42.30748748779297, "logps/ref_rejected": -101.40332794189453, "logps/rejected": -287.5384826660156, "loss": 0.4875, "rewards/accuracies": 0.828125, "rewards/chosen": -2.2037370204925537, "rewards/margins": 2.8900716304779053, "rewards/rejected": -5.093808650970459, "step": 177 }, { "epoch": 0.26138032305433184, "epsilon_dpo/beta": 0.027148790657520294, "epsilon_dpo/beta_margin_grad_mean": -0.11924877017736435, "epsilon_dpo/beta_margin_grad_std": 0.15244512259960175, "epsilon_dpo/beta_margin_mean": 3.4991955757141113, "epsilon_dpo/beta_margin_std": 2.5016074180603027, "epsilon_dpo/loss_margin_mean": 128.963623046875, "grad_norm": 60.96648025512695, "kl/avg_steps": 0.9375, "kl/beta": 0.027402978390455246, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -2.2090775966644287, "logits/rejected": -2.2421045303344727, "logps/chosen": -107.5852279663086, "logps/ref_chosen": -32.16440963745117, "logps/ref_rejected": -111.8971939086914, "logps/rejected": -316.2816467285156, "loss": 0.2914, "rewards/accuracies": 0.953125, "rewards/chosen": -2.04891300201416, "rewards/margins": 3.4991955757141113, "rewards/rejected": -5.5481085777282715, "step": 178 }, { "epoch": 0.26284875183553597, "epsilon_dpo/beta": 0.026943257078528404, "epsilon_dpo/beta_margin_grad_mean": -0.16198381781578064, "epsilon_dpo/beta_margin_grad_std": 0.22395355999469757, "epsilon_dpo/beta_margin_mean": 2.953178882598877, "epsilon_dpo/beta_margin_std": 2.3476455211639404, "epsilon_dpo/loss_margin_mean": 109.93001556396484, "grad_norm": 76.76069641113281, "kl/avg_steps": 0.765625, "kl/beta": 0.02714846096932888, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -2.2808642387390137, "logits/rejected": -2.315495252609253, "logps/chosen": -120.60657501220703, "logps/ref_chosen": -41.662925720214844, "logps/ref_rejected": -96.57734680175781, "logps/rejected": -285.45098876953125, "loss": 0.4834, "rewards/accuracies": 0.890625, "rewards/chosen": -2.1310267448425293, "rewards/margins": 2.953178882598877, "rewards/rejected": -5.084205627441406, "step": 179 }, { "epoch": 0.2643171806167401, "epsilon_dpo/beta": 0.02671753242611885, "epsilon_dpo/beta_margin_grad_mean": -0.13764944672584534, "epsilon_dpo/beta_margin_grad_std": 0.20955896377563477, "epsilon_dpo/beta_margin_mean": 3.045346975326538, "epsilon_dpo/beta_margin_std": 2.1593165397644043, "epsilon_dpo/loss_margin_mean": 114.23480987548828, "grad_norm": 90.68710327148438, "kl/avg_steps": 0.84375, "kl/beta": 0.026942186057567596, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.611919330113591e-07, "logits/chosen": -2.333063840866089, "logits/rejected": -2.4003429412841797, "logps/chosen": -128.1096649169922, "logps/ref_chosen": -45.704586029052734, "logps/ref_rejected": -106.64442443847656, "logps/rejected": -303.2843017578125, "loss": 0.4102, "rewards/accuracies": 0.921875, "rewards/chosen": -2.2053043842315674, "rewards/margins": 3.045346975326538, "rewards/rejected": -5.2506513595581055, "step": 180 }, { "epoch": 0.2657856093979442, "epsilon_dpo/beta": 0.026519039645791054, "epsilon_dpo/beta_margin_grad_mean": -0.17692674696445465, "epsilon_dpo/beta_margin_grad_std": 0.21322111785411835, "epsilon_dpo/beta_margin_mean": 2.4334042072296143, "epsilon_dpo/beta_margin_std": 2.0837597846984863, "epsilon_dpo/loss_margin_mean": 92.05476379394531, "grad_norm": 90.5920181274414, "kl/avg_steps": 0.75, "kl/beta": 0.02671676315367222, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.605024008834863e-07, "logits/chosen": -2.3339180946350098, "logits/rejected": -2.1919121742248535, "logps/chosen": -121.45870971679688, "logps/ref_chosen": -42.68443298339844, "logps/ref_rejected": -66.97550201416016, "logps/rejected": -237.80453491210938, "loss": 0.5481, "rewards/accuracies": 0.90625, "rewards/chosen": -2.093709945678711, "rewards/margins": 2.4334044456481934, "rewards/rejected": -4.527114391326904, "step": 181 }, { "epoch": 0.26725403817914833, "epsilon_dpo/beta": 0.026296764612197876, "epsilon_dpo/beta_margin_grad_mean": -0.14079764485359192, "epsilon_dpo/beta_margin_grad_std": 0.20722897350788116, "epsilon_dpo/beta_margin_mean": 3.104597330093384, "epsilon_dpo/beta_margin_std": 2.1538186073303223, "epsilon_dpo/loss_margin_mean": 118.28819274902344, "grad_norm": 73.83556365966797, "kl/avg_steps": 0.84375, "kl/beta": 0.026517879217863083, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.598073218215817e-07, "logits/chosen": -2.208120822906494, "logits/rejected": -2.3168540000915527, "logps/chosen": -95.87370300292969, "logps/ref_chosen": -22.746728897094727, "logps/ref_rejected": -96.14221954345703, "logps/rejected": -287.557373046875, "loss": 0.402, "rewards/accuracies": 0.90625, "rewards/chosen": -1.9256913661956787, "rewards/margins": 3.104597568511963, "rewards/rejected": -5.0302886962890625, "step": 182 }, { "epoch": 0.2687224669603524, "epsilon_dpo/beta": 0.02606852352619171, "epsilon_dpo/beta_margin_grad_mean": -0.12661683559417725, "epsilon_dpo/beta_margin_grad_std": 0.14383679628372192, "epsilon_dpo/beta_margin_mean": 2.813413619995117, "epsilon_dpo/beta_margin_std": 1.8080170154571533, "epsilon_dpo/loss_margin_mean": 108.05697631835938, "grad_norm": 49.043949127197266, "kl/avg_steps": 0.875, "kl/beta": 0.02629600651562214, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -2.3619985580444336, "logits/rejected": -2.227496862411499, "logps/chosen": -111.11225891113281, "logps/ref_chosen": -37.3929557800293, "logps/ref_rejected": -82.01314544677734, "logps/rejected": -263.7894287109375, "loss": 0.305, "rewards/accuracies": 0.96875, "rewards/chosen": -1.921753168106079, "rewards/margins": 2.813413619995117, "rewards/rejected": -4.735166549682617, "step": 183 }, { "epoch": 0.2701908957415565, "epsilon_dpo/beta": 0.02590757980942726, "epsilon_dpo/beta_margin_grad_mean": -0.22445106506347656, "epsilon_dpo/beta_margin_grad_std": 0.2272690087556839, "epsilon_dpo/beta_margin_mean": 2.006890058517456, "epsilon_dpo/beta_margin_std": 1.832121729850769, "epsilon_dpo/loss_margin_mean": 77.82298278808594, "grad_norm": 91.00837707519531, "kl/avg_steps": 0.625, "kl/beta": 0.026067912578582764, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -2.4576053619384766, "logits/rejected": -2.333487033843994, "logps/chosen": -125.68778991699219, "logps/ref_chosen": -43.53949737548828, "logps/ref_rejected": -81.33880615234375, "logps/rejected": -241.31008911132812, "loss": 0.6265, "rewards/accuracies": 0.828125, "rewards/chosen": -2.1323318481445312, "rewards/margins": 2.006890058517456, "rewards/rejected": -4.139222145080566, "step": 184 }, { "epoch": 0.27165932452276065, "epsilon_dpo/beta": 0.0256899856030941, "epsilon_dpo/beta_margin_grad_mean": -0.16735970973968506, "epsilon_dpo/beta_margin_grad_std": 0.17599676549434662, "epsilon_dpo/beta_margin_mean": 2.3658905029296875, "epsilon_dpo/beta_margin_std": 1.7580143213272095, "epsilon_dpo/loss_margin_mean": 92.25860595703125, "grad_norm": 67.63098907470703, "kl/avg_steps": 0.84375, "kl/beta": 0.025905998423695564, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -2.360046863555908, "logits/rejected": -2.2244629859924316, "logps/chosen": -111.3865737915039, "logps/ref_chosen": -37.372806549072266, "logps/ref_rejected": -78.53436279296875, "logps/rejected": -244.80673217773438, "loss": 0.4255, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9024994373321533, "rewards/margins": 2.3658905029296875, "rewards/rejected": -4.268389701843262, "step": 185 }, { "epoch": 0.27312775330396477, "epsilon_dpo/beta": 0.02545095421373844, "epsilon_dpo/beta_margin_grad_mean": -0.09410841017961502, "epsilon_dpo/beta_margin_grad_std": 0.11750077456235886, "epsilon_dpo/beta_margin_mean": 3.1497642993927, "epsilon_dpo/beta_margin_std": 1.6914327144622803, "epsilon_dpo/loss_margin_mean": 123.8324966430664, "grad_norm": 39.774940490722656, "kl/avg_steps": 0.9375, "kl/beta": 0.025689246132969856, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -2.5002646446228027, "logits/rejected": -2.3675756454467773, "logps/chosen": -100.51115417480469, "logps/ref_chosen": -41.56720733642578, "logps/ref_rejected": -93.58636474609375, "logps/rejected": -276.3628234863281, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": -1.5010137557983398, "rewards/margins": 3.1497642993927, "rewards/rejected": -4.650777816772461, "step": 186 }, { "epoch": 0.2745961820851689, "epsilon_dpo/beta": 0.025262294337153435, "epsilon_dpo/beta_margin_grad_mean": -0.18915115296840668, "epsilon_dpo/beta_margin_grad_std": 0.22492022812366486, "epsilon_dpo/beta_margin_mean": 2.369025468826294, "epsilon_dpo/beta_margin_std": 1.9370827674865723, "epsilon_dpo/loss_margin_mean": 94.07389831542969, "grad_norm": 85.82039642333984, "kl/avg_steps": 0.75, "kl/beta": 0.02545064687728882, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -2.322283983230591, "logits/rejected": -2.289004325866699, "logps/chosen": -108.8873291015625, "logps/ref_chosen": -35.3756217956543, "logps/ref_rejected": -86.07025146484375, "logps/rejected": -253.65586853027344, "loss": 0.5347, "rewards/accuracies": 0.875, "rewards/chosen": -1.861724615097046, "rewards/margins": 2.369025230407715, "rewards/rejected": -4.23075008392334, "step": 187 }, { "epoch": 0.27606461086637296, "epsilon_dpo/beta": 0.025042656809091568, "epsilon_dpo/beta_margin_grad_mean": -0.15640155971050262, "epsilon_dpo/beta_margin_grad_std": 0.18032178282737732, "epsilon_dpo/beta_margin_mean": 2.593634843826294, "epsilon_dpo/beta_margin_std": 1.8406410217285156, "epsilon_dpo/loss_margin_mean": 103.7129898071289, "grad_norm": 62.04313659667969, "kl/avg_steps": 0.875, "kl/beta": 0.02526118792593479, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -2.326641321182251, "logits/rejected": -2.3683671951293945, "logps/chosen": -104.10310363769531, "logps/ref_chosen": -34.50104522705078, "logps/ref_rejected": -96.81013488769531, "logps/rejected": -270.12518310546875, "loss": 0.4002, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7442595958709717, "rewards/margins": 2.593634843826294, "rewards/rejected": -4.337894439697266, "step": 188 }, { "epoch": 0.2775330396475771, "epsilon_dpo/beta": 0.024817608296871185, "epsilon_dpo/beta_margin_grad_mean": -0.13081274926662445, "epsilon_dpo/beta_margin_grad_std": 0.16288872063159943, "epsilon_dpo/beta_margin_mean": 2.6774120330810547, "epsilon_dpo/beta_margin_std": 1.7680622339248657, "epsilon_dpo/loss_margin_mean": 108.02799224853516, "grad_norm": 57.21524429321289, "kl/avg_steps": 0.90625, "kl/beta": 0.02504207007586956, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -2.281872272491455, "logits/rejected": -2.1657588481903076, "logps/chosen": -94.84893798828125, "logps/ref_chosen": -30.625194549560547, "logps/ref_rejected": -78.02359008789062, "logps/rejected": -250.27532958984375, "loss": 0.3608, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5961487293243408, "rewards/margins": 2.6774120330810547, "rewards/rejected": -4.273560523986816, "step": 189 }, { "epoch": 0.2790014684287812, "epsilon_dpo/beta": 0.024594716727733612, "epsilon_dpo/beta_margin_grad_mean": -0.14998801052570343, "epsilon_dpo/beta_margin_grad_std": 0.1647987961769104, "epsilon_dpo/beta_margin_mean": 2.706925392150879, "epsilon_dpo/beta_margin_std": 1.9622080326080322, "epsilon_dpo/loss_margin_mean": 110.1700210571289, "grad_norm": 55.00584030151367, "kl/avg_steps": 0.90625, "kl/beta": 0.02481716312468052, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -2.403958797454834, "logits/rejected": -2.4575366973876953, "logps/chosen": -105.57707214355469, "logps/ref_chosen": -41.5223388671875, "logps/ref_rejected": -109.71112823486328, "logps/rejected": -283.9358825683594, "loss": 0.3711, "rewards/accuracies": 0.953125, "rewards/chosen": -1.5758315324783325, "rewards/margins": 2.706925392150879, "rewards/rejected": -4.282756805419922, "step": 190 }, { "epoch": 0.28046989720998533, "epsilon_dpo/beta": 0.024389203637838364, "epsilon_dpo/beta_margin_grad_mean": -0.1167060062289238, "epsilon_dpo/beta_margin_grad_std": 0.15317706763744354, "epsilon_dpo/beta_margin_mean": 3.051992416381836, "epsilon_dpo/beta_margin_std": 1.9232109785079956, "epsilon_dpo/loss_margin_mean": 125.34806060791016, "grad_norm": 55.07308578491211, "kl/avg_steps": 0.84375, "kl/beta": 0.024594279006123543, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -2.455369234085083, "logits/rejected": -2.3924498558044434, "logps/chosen": -107.12852478027344, "logps/ref_chosen": -43.46269989013672, "logps/ref_rejected": -115.54437255859375, "logps/rejected": -304.5582580566406, "loss": 0.2901, "rewards/accuracies": 0.9375, "rewards/chosen": -1.554311990737915, "rewards/margins": 3.051992416381836, "rewards/rejected": -4.606304168701172, "step": 191 }, { "epoch": 0.28193832599118945, "epsilon_dpo/beta": 0.024181291460990906, "epsilon_dpo/beta_margin_grad_mean": -0.16211238503456116, "epsilon_dpo/beta_margin_grad_std": 0.18894152343273163, "epsilon_dpo/beta_margin_mean": 2.575836181640625, "epsilon_dpo/beta_margin_std": 1.9563552141189575, "epsilon_dpo/loss_margin_mean": 106.70730590820312, "grad_norm": 76.85231018066406, "kl/avg_steps": 0.859375, "kl/beta": 0.024388499557971954, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -2.2935123443603516, "logits/rejected": -2.236546754837036, "logps/chosen": -124.95817565917969, "logps/ref_chosen": -45.30467224121094, "logps/ref_rejected": -109.86914825439453, "logps/rejected": -296.22998046875, "loss": 0.4295, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9286646842956543, "rewards/margins": 2.575836181640625, "rewards/rejected": -4.504500865936279, "step": 192 }, { "epoch": 0.2834067547723935, "epsilon_dpo/beta": 0.02397906966507435, "epsilon_dpo/beta_margin_grad_mean": -0.1852301061153412, "epsilon_dpo/beta_margin_grad_std": 0.20167653262615204, "epsilon_dpo/beta_margin_mean": 2.189572334289551, "epsilon_dpo/beta_margin_std": 1.6879116296768188, "epsilon_dpo/loss_margin_mean": 91.50328063964844, "grad_norm": 68.04452514648438, "kl/avg_steps": 0.84375, "kl/beta": 0.0241806972771883, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -2.299919605255127, "logits/rejected": -2.2096662521362305, "logps/chosen": -114.80885314941406, "logps/ref_chosen": -38.554039001464844, "logps/ref_rejected": -81.17373657226562, "logps/rejected": -248.93182373046875, "loss": 0.5015, "rewards/accuracies": 0.890625, "rewards/chosen": -1.82987380027771, "rewards/margins": 2.189572334289551, "rewards/rejected": -4.01944637298584, "step": 193 }, { "epoch": 0.28487518355359764, "epsilon_dpo/beta": 0.023755960166454315, "epsilon_dpo/beta_margin_grad_mean": -0.15306001901626587, "epsilon_dpo/beta_margin_grad_std": 0.14770975708961487, "epsilon_dpo/beta_margin_mean": 2.215096950531006, "epsilon_dpo/beta_margin_std": 1.2999720573425293, "epsilon_dpo/loss_margin_mean": 93.31468200683594, "grad_norm": 58.625755310058594, "kl/avg_steps": 0.9375, "kl/beta": 0.023978380486369133, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.510405240853854e-07, "logits/chosen": -2.02514386177063, "logits/rejected": -1.9236395359039307, "logps/chosen": -111.03370666503906, "logps/ref_chosen": -26.635162353515625, "logps/ref_rejected": -64.14307403564453, "logps/rejected": -241.85629272460938, "loss": 0.3712, "rewards/accuracies": 0.953125, "rewards/chosen": -2.005694627761841, "rewards/margins": 2.215096950531006, "rewards/rejected": -4.220791816711426, "step": 194 }, { "epoch": 0.28634361233480177, "epsilon_dpo/beta": 0.023535314947366714, "epsilon_dpo/beta_margin_grad_mean": -0.128713458776474, "epsilon_dpo/beta_margin_grad_std": 0.14728419482707977, "epsilon_dpo/beta_margin_mean": 2.749429702758789, "epsilon_dpo/beta_margin_std": 1.724953055381775, "epsilon_dpo/loss_margin_mean": 116.91402435302734, "grad_norm": 68.01368713378906, "kl/avg_steps": 0.9375, "kl/beta": 0.023755669593811035, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -2.1741151809692383, "logits/rejected": -2.245378255844116, "logps/chosen": -107.69198608398438, "logps/ref_chosen": -33.191192626953125, "logps/ref_rejected": -103.19746398925781, "logps/rejected": -294.6122741699219, "loss": 0.316, "rewards/accuracies": 0.96875, "rewards/chosen": -1.754541277885437, "rewards/margins": 2.749429702758789, "rewards/rejected": -4.503971099853516, "step": 195 }, { "epoch": 0.2878120411160059, "epsilon_dpo/beta": 0.023338787257671356, "epsilon_dpo/beta_margin_grad_mean": -0.13086631894111633, "epsilon_dpo/beta_margin_grad_std": 0.16106443107128143, "epsilon_dpo/beta_margin_mean": 2.784287929534912, "epsilon_dpo/beta_margin_std": 1.7888661623001099, "epsilon_dpo/loss_margin_mean": 119.4956283569336, "grad_norm": 46.899410247802734, "kl/avg_steps": 0.84375, "kl/beta": 0.023535029962658882, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.495043068200599e-07, "logits/chosen": -2.1250100135803223, "logits/rejected": -1.9867154359817505, "logps/chosen": -101.93032836914062, "logps/ref_chosen": -30.988731384277344, "logps/ref_rejected": -82.06416320800781, "logps/rejected": -272.50140380859375, "loss": 0.3278, "rewards/accuracies": 0.953125, "rewards/chosen": -1.6580393314361572, "rewards/margins": 2.784287929534912, "rewards/rejected": -4.442327499389648, "step": 196 }, { "epoch": 0.28928046989721, "epsilon_dpo/beta": 0.02313622087240219, "epsilon_dpo/beta_margin_grad_mean": -0.15594223141670227, "epsilon_dpo/beta_margin_grad_std": 0.17379499971866608, "epsilon_dpo/beta_margin_mean": 2.3834075927734375, "epsilon_dpo/beta_margin_std": 1.601683259010315, "epsilon_dpo/loss_margin_mean": 103.17386627197266, "grad_norm": 68.2065200805664, "kl/avg_steps": 0.875, "kl/beta": 0.023338114842772484, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -2.200084686279297, "logits/rejected": -2.1047005653381348, "logps/chosen": -115.05329895019531, "logps/ref_chosen": -38.329471588134766, "logps/ref_rejected": -81.55877685546875, "logps/rejected": -261.45648193359375, "loss": 0.4039, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7775802612304688, "rewards/margins": 2.3834075927734375, "rewards/rejected": -4.160987854003906, "step": 197 }, { "epoch": 0.2907488986784141, "epsilon_dpo/beta": 0.0229355338960886, "epsilon_dpo/beta_margin_grad_mean": -0.13343603909015656, "epsilon_dpo/beta_margin_grad_std": 0.17914170026779175, "epsilon_dpo/beta_margin_mean": 2.9484434127807617, "epsilon_dpo/beta_margin_std": 2.0632452964782715, "epsilon_dpo/loss_margin_mean": 128.7584228515625, "grad_norm": 67.33406829833984, "kl/avg_steps": 0.875, "kl/beta": 0.023135676980018616, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.479470611971645e-07, "logits/chosen": -2.1627941131591797, "logits/rejected": -2.1062378883361816, "logps/chosen": -111.89299011230469, "logps/ref_chosen": -33.095436096191406, "logps/ref_rejected": -102.81793212890625, "logps/rejected": -310.3739013671875, "loss": 0.3647, "rewards/accuracies": 0.953125, "rewards/chosen": -1.8085954189300537, "rewards/margins": 2.9484434127807617, "rewards/rejected": -4.7570390701293945, "step": 198 }, { "epoch": 0.2922173274596182, "epsilon_dpo/beta": 0.0227294210344553, "epsilon_dpo/beta_margin_grad_mean": -0.13506709039211273, "epsilon_dpo/beta_margin_grad_std": 0.15224339067935944, "epsilon_dpo/beta_margin_mean": 2.6718122959136963, "epsilon_dpo/beta_margin_std": 1.751692295074463, "epsilon_dpo/loss_margin_mean": 117.67414855957031, "grad_norm": 61.87571334838867, "kl/avg_steps": 0.90625, "kl/beta": 0.022934995591640472, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.471606039587695e-07, "logits/chosen": -2.119852066040039, "logits/rejected": -1.9955350160598755, "logps/chosen": -117.20360565185547, "logps/ref_chosen": -34.307518005371094, "logps/ref_rejected": -90.8328857421875, "logps/rejected": -291.4031066894531, "loss": 0.3347, "rewards/accuracies": 0.953125, "rewards/chosen": -1.884255290031433, "rewards/margins": 2.6718125343322754, "rewards/rejected": -4.55606746673584, "step": 199 }, { "epoch": 0.2936857562408223, "epsilon_dpo/beta": 0.022539492696523666, "epsilon_dpo/beta_margin_grad_mean": -0.1900298297405243, "epsilon_dpo/beta_margin_grad_std": 0.20542176067829132, "epsilon_dpo/beta_margin_mean": 2.5110652446746826, "epsilon_dpo/beta_margin_std": 2.381103754043579, "epsilon_dpo/loss_margin_mean": 111.65966033935547, "grad_norm": 93.91866302490234, "kl/avg_steps": 0.84375, "kl/beta": 0.022729014977812767, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -2.0332164764404297, "logits/rejected": -1.9690152406692505, "logps/chosen": -130.70755004882812, "logps/ref_chosen": -43.81787872314453, "logps/ref_rejected": -86.25389099121094, "logps/rejected": -284.80322265625, "loss": 0.559, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9615219831466675, "rewards/margins": 2.5110652446746826, "rewards/rejected": -4.4725871086120605, "step": 200 }, { "epoch": 0.2936857562408223, "eval_epsilon_dpo/beta": 0.022413820028305054, "eval_epsilon_dpo/beta_margin_grad_mean": -0.26273271441459656, "eval_epsilon_dpo/beta_margin_grad_std": 0.2419932633638382, "eval_epsilon_dpo/beta_margin_mean": 1.7521748542785645, "eval_epsilon_dpo/beta_margin_std": 1.9999192953109741, "eval_epsilon_dpo/loss_margin_mean": 78.65644836425781, "eval_kl/n_epsilon_steps": 0.21746575832366943, "eval_kl/p_epsilon_steps": 0.7821061611175537, "eval_logits/chosen": -2.2068309783935547, "eval_logits/rejected": -2.105496644973755, "eval_logps/chosen": -175.93331909179688, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -278.3790588378906, "eval_loss": 0.4013729393482208, "eval_rewards/accuracies": 0.798373281955719, "eval_rewards/chosen": -2.4182353019714355, "eval_rewards/margins": 1.7521746158599854, "eval_rewards/rejected": -4.170409679412842, "eval_runtime": 38.3575, "eval_samples_per_second": 60.979, "eval_steps_per_second": 1.929, "step": 200 }, { "epoch": 0.29515418502202645, "epsilon_dpo/beta": 0.02235795184969902, "epsilon_dpo/beta_margin_grad_mean": -0.1633475422859192, "epsilon_dpo/beta_margin_grad_std": 0.2009790688753128, "epsilon_dpo/beta_margin_mean": 2.613624095916748, "epsilon_dpo/beta_margin_std": 1.999254584312439, "epsilon_dpo/loss_margin_mean": 117.17089080810547, "grad_norm": 77.21038055419922, "kl/avg_steps": 0.8125, "kl/beta": 0.022538842633366585, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.455721242469372e-07, "logits/chosen": -2.2331690788269043, "logits/rejected": -2.180114269256592, "logps/chosen": -133.12782287597656, "logps/ref_chosen": -55.676116943359375, "logps/ref_rejected": -121.86392974853516, "logps/rejected": -316.4865417480469, "loss": 0.4443, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7362768650054932, "rewards/margins": 2.613624095916748, "rewards/rejected": -4.34990119934082, "step": 201 }, { "epoch": 0.2966226138032305, "epsilon_dpo/beta": 0.022170767188072205, "epsilon_dpo/beta_margin_grad_mean": -0.16092374920845032, "epsilon_dpo/beta_margin_grad_std": 0.16366447508335114, "epsilon_dpo/beta_margin_mean": 2.40342116355896, "epsilon_dpo/beta_margin_std": 1.810045599937439, "epsilon_dpo/loss_margin_mean": 108.59467315673828, "grad_norm": 63.114845275878906, "kl/avg_steps": 0.84375, "kl/beta": 0.022357190027832985, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -1.9032566547393799, "logits/rejected": -2.0256400108337402, "logps/chosen": -113.52767181396484, "logps/ref_chosen": -30.73172378540039, "logps/ref_rejected": -93.48927307128906, "logps/rejected": -284.8798828125, "loss": 0.4009, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8372435569763184, "rewards/margins": 2.403420925140381, "rewards/rejected": -4.240664482116699, "step": 202 }, { "epoch": 0.29809104258443464, "epsilon_dpo/beta": 0.021971412003040314, "epsilon_dpo/beta_margin_grad_mean": -0.12936589121818542, "epsilon_dpo/beta_margin_grad_std": 0.16617560386657715, "epsilon_dpo/beta_margin_mean": 2.778576612472534, "epsilon_dpo/beta_margin_std": 1.7826684713363647, "epsilon_dpo/loss_margin_mean": 126.62322998046875, "grad_norm": 45.17288589477539, "kl/avg_steps": 0.90625, "kl/beta": 0.022170130163431168, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.439630306414758e-07, "logits/chosen": -2.110769748687744, "logits/rejected": -2.13863468170166, "logps/chosen": -107.68612670898438, "logps/ref_chosen": -38.8436393737793, "logps/ref_rejected": -92.13667297363281, "logps/rejected": -287.6023864746094, "loss": 0.3417, "rewards/accuracies": 0.953125, "rewards/chosen": -1.5144764184951782, "rewards/margins": 2.778576612472534, "rewards/rejected": -4.293052673339844, "step": 203 }, { "epoch": 0.29955947136563876, "epsilon_dpo/beta": 0.02179468236863613, "epsilon_dpo/beta_margin_grad_mean": -0.14814503490924835, "epsilon_dpo/beta_margin_grad_std": 0.18509677052497864, "epsilon_dpo/beta_margin_mean": 2.5697386264801025, "epsilon_dpo/beta_margin_std": 1.7625229358673096, "epsilon_dpo/loss_margin_mean": 118.17318725585938, "grad_norm": 67.1108169555664, "kl/avg_steps": 0.8125, "kl/beta": 0.021971017122268677, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.431508065452897e-07, "logits/chosen": -2.281491756439209, "logits/rejected": -2.114370822906494, "logps/chosen": -135.9166259765625, "logps/ref_chosen": -55.713932037353516, "logps/ref_rejected": -93.70796203613281, "logps/rejected": -292.0838623046875, "loss": 0.3937, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7504637241363525, "rewards/margins": 2.5697386264801025, "rewards/rejected": -4.320202350616455, "step": 204 }, { "epoch": 0.3010279001468429, "epsilon_dpo/beta": 0.021612215787172318, "epsilon_dpo/beta_margin_grad_mean": -0.12213469296693802, "epsilon_dpo/beta_margin_grad_std": 0.18215671181678772, "epsilon_dpo/beta_margin_mean": 2.9263906478881836, "epsilon_dpo/beta_margin_std": 1.8439240455627441, "epsilon_dpo/loss_margin_mean": 135.66934204101562, "grad_norm": 67.83074951171875, "kl/avg_steps": 0.84375, "kl/beta": 0.021793941035866737, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -2.015986680984497, "logits/rejected": -1.8826128244400024, "logps/chosen": -121.29883575439453, "logps/ref_chosen": -34.816200256347656, "logps/ref_rejected": -92.58261108398438, "logps/rejected": -314.7345886230469, "loss": 0.3485, "rewards/accuracies": 0.921875, "rewards/chosen": -1.872864007949829, "rewards/margins": 2.9263906478881836, "rewards/rejected": -4.799254417419434, "step": 205 }, { "epoch": 0.302496328928047, "epsilon_dpo/beta": 0.021431388333439827, "epsilon_dpo/beta_margin_grad_mean": -0.14491204917430878, "epsilon_dpo/beta_margin_grad_std": 0.19017614424228668, "epsilon_dpo/beta_margin_mean": 2.6911041736602783, "epsilon_dpo/beta_margin_std": 1.9302713871002197, "epsilon_dpo/loss_margin_mean": 125.82368469238281, "grad_norm": 70.67012023925781, "kl/avg_steps": 0.84375, "kl/beta": 0.021611593663692474, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.415111107797445e-07, "logits/chosen": -1.9591203927993774, "logits/rejected": -1.9519094228744507, "logps/chosen": -117.2315673828125, "logps/ref_chosen": -30.099918365478516, "logps/ref_rejected": -103.39237976074219, "logps/rejected": -316.34771728515625, "loss": 0.4266, "rewards/accuracies": 0.921875, "rewards/chosen": -1.8712209463119507, "rewards/margins": 2.6911041736602783, "rewards/rejected": -4.5623250007629395, "step": 206 }, { "epoch": 0.3039647577092511, "epsilon_dpo/beta": 0.02126547135412693, "epsilon_dpo/beta_margin_grad_mean": -0.1460837423801422, "epsilon_dpo/beta_margin_grad_std": 0.1840200126171112, "epsilon_dpo/beta_margin_mean": 2.8168625831604004, "epsilon_dpo/beta_margin_std": 1.992667317390442, "epsilon_dpo/loss_margin_mean": 132.78456115722656, "grad_norm": 59.43008041381836, "kl/avg_steps": 0.78125, "kl/beta": 0.02143077179789543, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -2.092212200164795, "logits/rejected": -1.968192458152771, "logps/chosen": -101.26658630371094, "logps/ref_chosen": -31.34187889099121, "logps/ref_rejected": -89.86247253417969, "logps/rejected": -292.57171630859375, "loss": 0.3788, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4902305603027344, "rewards/margins": 2.8168625831604004, "rewards/rejected": -4.307093143463135, "step": 207 }, { "epoch": 0.3054331864904552, "epsilon_dpo/beta": 0.02108733169734478, "epsilon_dpo/beta_margin_grad_mean": -0.1172553300857544, "epsilon_dpo/beta_margin_grad_std": 0.15206165611743927, "epsilon_dpo/beta_margin_mean": 2.8677847385406494, "epsilon_dpo/beta_margin_std": 1.6818532943725586, "epsilon_dpo/loss_margin_mean": 136.2245635986328, "grad_norm": 52.98038101196289, "kl/avg_steps": 0.84375, "kl/beta": 0.021264642477035522, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.398512291636768e-07, "logits/chosen": -2.082803726196289, "logits/rejected": -2.1149425506591797, "logps/chosen": -119.51541137695312, "logps/ref_chosen": -35.819129943847656, "logps/ref_rejected": -100.89794921875, "logps/rejected": -320.81878662109375, "loss": 0.2919, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7668159008026123, "rewards/margins": 2.8677845001220703, "rewards/rejected": -4.634600639343262, "step": 208 }, { "epoch": 0.3069016152716593, "epsilon_dpo/beta": 0.020917484536767006, "epsilon_dpo/beta_margin_grad_mean": -0.1551363617181778, "epsilon_dpo/beta_margin_grad_std": 0.20075276494026184, "epsilon_dpo/beta_margin_mean": 2.5712711811065674, "epsilon_dpo/beta_margin_std": 1.814591646194458, "epsilon_dpo/loss_margin_mean": 123.22737884521484, "grad_norm": 68.60906219482422, "kl/avg_steps": 0.8125, "kl/beta": 0.02108672261238098, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -2.075434446334839, "logits/rejected": -1.8750395774841309, "logps/chosen": -114.03466796875, "logps/ref_chosen": -38.91720199584961, "logps/ref_rejected": -86.70390319824219, "logps/rejected": -285.0487365722656, "loss": 0.4441, "rewards/accuracies": 0.921875, "rewards/chosen": -1.573714017868042, "rewards/margins": 2.5712709426879883, "rewards/rejected": -4.144985198974609, "step": 209 }, { "epoch": 0.30837004405286345, "epsilon_dpo/beta": 0.020716214552521706, "epsilon_dpo/beta_margin_grad_mean": -0.13410209119319916, "epsilon_dpo/beta_margin_grad_std": 0.13863912224769592, "epsilon_dpo/beta_margin_mean": 2.6166491508483887, "epsilon_dpo/beta_margin_std": 1.6349241733551025, "epsilon_dpo/loss_margin_mean": 126.35758972167969, "grad_norm": 44.407615661621094, "kl/avg_steps": 0.96875, "kl/beta": 0.02091677300632, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 4.381713366536311e-07, "logits/chosen": -2.012350082397461, "logits/rejected": -1.9750981330871582, "logps/chosen": -108.2236099243164, "logps/ref_chosen": -29.373889923095703, "logps/ref_rejected": -85.03504943847656, "logps/rejected": -290.24237060546875, "loss": 0.32, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6338069438934326, "rewards/margins": 2.6166491508483887, "rewards/rejected": -4.250455856323242, "step": 210 }, { "epoch": 0.30983847283406757, "epsilon_dpo/beta": 0.02053687535226345, "epsilon_dpo/beta_margin_grad_mean": -0.1510934829711914, "epsilon_dpo/beta_margin_grad_std": 0.18279674649238586, "epsilon_dpo/beta_margin_mean": 2.54111647605896, "epsilon_dpo/beta_margin_std": 1.720736026763916, "epsilon_dpo/loss_margin_mean": 123.92025756835938, "grad_norm": 62.367835998535156, "kl/avg_steps": 0.875, "kl/beta": 0.02071608603000641, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.373239415645323e-07, "logits/chosen": -2.1857690811157227, "logits/rejected": -2.038249969482422, "logps/chosen": -131.88340759277344, "logps/ref_chosen": -47.88237380981445, "logps/ref_rejected": -93.18321228027344, "logps/rejected": -301.1044921875, "loss": 0.4024, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7274291515350342, "rewards/margins": 2.541116237640381, "rewards/rejected": -4.268545150756836, "step": 211 }, { "epoch": 0.31130690161527164, "epsilon_dpo/beta": 0.02035231702029705, "epsilon_dpo/beta_margin_grad_mean": -0.11501824855804443, "epsilon_dpo/beta_margin_grad_std": 0.16267718374729156, "epsilon_dpo/beta_margin_mean": 3.0987837314605713, "epsilon_dpo/beta_margin_std": 2.040536403656006, "epsilon_dpo/loss_margin_mean": 152.44503784179688, "grad_norm": 56.73617935180664, "kl/avg_steps": 0.90625, "kl/beta": 0.0205363929271698, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -2.1070785522460938, "logits/rejected": -2.0808253288269043, "logps/chosen": -105.17568969726562, "logps/ref_chosen": -35.5427360534668, "logps/ref_rejected": -110.93476867675781, "logps/rejected": -333.01275634765625, "loss": 0.3267, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4194426536560059, "rewards/margins": 3.0987837314605713, "rewards/rejected": -4.518226623535156, "step": 212 }, { "epoch": 0.31277533039647576, "epsilon_dpo/beta": 0.02018861286342144, "epsilon_dpo/beta_margin_grad_mean": -0.1576324999332428, "epsilon_dpo/beta_margin_grad_std": 0.1839437037706375, "epsilon_dpo/beta_margin_mean": 2.5347602367401123, "epsilon_dpo/beta_margin_std": 1.81096613407135, "epsilon_dpo/loss_margin_mean": 125.82254028320312, "grad_norm": 60.45912551879883, "kl/avg_steps": 0.8125, "kl/beta": 0.02035195380449295, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -2.2190451622009277, "logits/rejected": -2.022303581237793, "logps/chosen": -121.68977355957031, "logps/ref_chosen": -46.382476806640625, "logps/ref_rejected": -98.22808074951172, "logps/rejected": -299.35791015625, "loss": 0.4116, "rewards/accuracies": 0.921875, "rewards/chosen": -1.522735595703125, "rewards/margins": 2.5347602367401123, "rewards/rejected": -4.057496070861816, "step": 213 }, { "epoch": 0.3142437591776799, "epsilon_dpo/beta": 0.02001328393816948, "epsilon_dpo/beta_margin_grad_mean": -0.18691565096378326, "epsilon_dpo/beta_margin_grad_std": 0.2173331379890442, "epsilon_dpo/beta_margin_mean": 2.255009889602661, "epsilon_dpo/beta_margin_std": 1.9002856016159058, "epsilon_dpo/loss_margin_mean": 112.92225646972656, "grad_norm": 80.94749450683594, "kl/avg_steps": 0.875, "kl/beta": 0.020187927410006523, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -1.925289273262024, "logits/rejected": -1.9057788848876953, "logps/chosen": -117.653564453125, "logps/ref_chosen": -33.69921112060547, "logps/ref_rejected": -83.6459732055664, "logps/rejected": -280.5225830078125, "loss": 0.5721, "rewards/accuracies": 0.90625, "rewards/chosen": -1.683471441268921, "rewards/margins": 2.255009889602661, "rewards/rejected": -3.938481330871582, "step": 214 }, { "epoch": 0.315712187958884, "epsilon_dpo/beta": 0.019833432510495186, "epsilon_dpo/beta_margin_grad_mean": -0.12425589561462402, "epsilon_dpo/beta_margin_grad_std": 0.13742640614509583, "epsilon_dpo/beta_margin_mean": 2.6745448112487793, "epsilon_dpo/beta_margin_std": 1.592254638671875, "epsilon_dpo/loss_margin_mean": 134.98655700683594, "grad_norm": 46.67967987060547, "kl/avg_steps": 0.90625, "kl/beta": 0.020012814551591873, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -2.086688756942749, "logits/rejected": -2.0053024291992188, "logps/chosen": -107.9367446899414, "logps/ref_chosen": -30.35393714904785, "logps/ref_rejected": -99.64697265625, "logps/rejected": -312.21636962890625, "loss": 0.2971, "rewards/accuracies": 0.953125, "rewards/chosen": -1.5396337509155273, "rewards/margins": 2.6745448112487793, "rewards/rejected": -4.214178562164307, "step": 215 }, { "epoch": 0.31718061674008813, "epsilon_dpo/beta": 0.019673900678753853, "epsilon_dpo/beta_margin_grad_mean": -0.1697354018688202, "epsilon_dpo/beta_margin_grad_std": 0.19105024635791779, "epsilon_dpo/beta_margin_mean": 2.5016613006591797, "epsilon_dpo/beta_margin_std": 1.8782446384429932, "epsilon_dpo/loss_margin_mean": 127.44037628173828, "grad_norm": 60.99223709106445, "kl/avg_steps": 0.8125, "kl/beta": 0.019833076745271683, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.330133748510036e-07, "logits/chosen": -2.0426881313323975, "logits/rejected": -1.9267804622650146, "logps/chosen": -102.5655746459961, "logps/ref_chosen": -28.687610626220703, "logps/ref_rejected": -83.20097351074219, "logps/rejected": -284.519287109375, "loss": 0.4448, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4557723999023438, "rewards/margins": 2.5016613006591797, "rewards/rejected": -3.9574337005615234, "step": 216 }, { "epoch": 0.3186490455212922, "epsilon_dpo/beta": 0.019503042101860046, "epsilon_dpo/beta_margin_grad_mean": -0.12006211280822754, "epsilon_dpo/beta_margin_grad_std": 0.1564583033323288, "epsilon_dpo/beta_margin_mean": 2.867358446121216, "epsilon_dpo/beta_margin_std": 1.7631230354309082, "epsilon_dpo/loss_margin_mean": 147.23101806640625, "grad_norm": 50.90324401855469, "kl/avg_steps": 0.875, "kl/beta": 0.01967323198914528, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -2.1286168098449707, "logits/rejected": -2.087315559387207, "logps/chosen": -98.74182891845703, "logps/ref_chosen": -31.528701782226562, "logps/ref_rejected": -107.33251190185547, "logps/rejected": -321.77667236328125, "loss": 0.3023, "rewards/accuracies": 0.953125, "rewards/chosen": -1.3119699954986572, "rewards/margins": 2.867358684539795, "rewards/rejected": -4.179328441619873, "step": 217 }, { "epoch": 0.3201174743024963, "epsilon_dpo/beta": 0.01932777464389801, "epsilon_dpo/beta_margin_grad_mean": -0.15312562882900238, "epsilon_dpo/beta_margin_grad_std": 0.161267951130867, "epsilon_dpo/beta_margin_mean": 2.22654128074646, "epsilon_dpo/beta_margin_std": 1.3763279914855957, "epsilon_dpo/loss_margin_mean": 115.3489990234375, "grad_norm": 58.98360824584961, "kl/avg_steps": 0.90625, "kl/beta": 0.019502583891153336, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -2.162309169769287, "logits/rejected": -2.1342501640319824, "logps/chosen": -108.99090576171875, "logps/ref_chosen": -36.7948112487793, "logps/ref_rejected": -93.1485366821289, "logps/rejected": -280.693603515625, "loss": 0.3962, "rewards/accuracies": 0.953125, "rewards/chosen": -1.39695405960083, "rewards/margins": 2.226541519165039, "rewards/rejected": -3.623495578765869, "step": 218 }, { "epoch": 0.32158590308370044, "epsilon_dpo/beta": 0.019178351387381554, "epsilon_dpo/beta_margin_grad_mean": -0.178512305021286, "epsilon_dpo/beta_margin_grad_std": 0.184907004237175, "epsilon_dpo/beta_margin_mean": 2.2544329166412354, "epsilon_dpo/beta_margin_std": 1.7746726274490356, "epsilon_dpo/loss_margin_mean": 117.84127807617188, "grad_norm": 61.87991714477539, "kl/avg_steps": 0.78125, "kl/beta": 0.019327430054545403, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.303689819449636e-07, "logits/chosen": -2.2709426879882812, "logits/rejected": -2.1236162185668945, "logps/chosen": -113.684814453125, "logps/ref_chosen": -42.875755310058594, "logps/ref_rejected": -92.20575714111328, "logps/rejected": -280.8560791015625, "loss": 0.4669, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3612611293792725, "rewards/margins": 2.2544326782226562, "rewards/rejected": -3.615694046020508, "step": 219 }, { "epoch": 0.32305433186490456, "epsilon_dpo/beta": 0.019017694517970085, "epsilon_dpo/beta_margin_grad_mean": -0.17227615416049957, "epsilon_dpo/beta_margin_grad_std": 0.1603304147720337, "epsilon_dpo/beta_margin_mean": 2.1362380981445312, "epsilon_dpo/beta_margin_std": 1.426170825958252, "epsilon_dpo/loss_margin_mean": 112.51897430419922, "grad_norm": 57.75115966796875, "kl/avg_steps": 0.84375, "kl/beta": 0.019177604466676712, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -2.249175548553467, "logits/rejected": -2.228762626647949, "logps/chosen": -121.99055480957031, "logps/ref_chosen": -43.218231201171875, "logps/ref_rejected": -94.84095764160156, "logps/rejected": -286.1322326660156, "loss": 0.4237, "rewards/accuracies": 0.953125, "rewards/chosen": -1.4991801977157593, "rewards/margins": 2.1362380981445312, "rewards/rejected": -3.63541841506958, "step": 220 }, { "epoch": 0.3245227606461087, "epsilon_dpo/beta": 0.018852630630135536, "epsilon_dpo/beta_margin_grad_mean": -0.11038573086261749, "epsilon_dpo/beta_margin_grad_std": 0.14331035315990448, "epsilon_dpo/beta_margin_mean": 2.8602123260498047, "epsilon_dpo/beta_margin_std": 1.571157693862915, "epsilon_dpo/loss_margin_mean": 151.9083251953125, "grad_norm": 44.90319061279297, "kl/avg_steps": 0.875, "kl/beta": 0.01901714690029621, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.285822501755485e-07, "logits/chosen": -2.0982179641723633, "logits/rejected": -2.1222004890441895, "logps/chosen": -105.45207214355469, "logps/ref_chosen": -36.884986877441406, "logps/ref_rejected": -112.87872314453125, "logps/rejected": -333.3541259765625, "loss": 0.2732, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2945091724395752, "rewards/margins": 2.8602123260498047, "rewards/rejected": -4.154721260070801, "step": 221 }, { "epoch": 0.32599118942731276, "epsilon_dpo/beta": 0.018689103424549103, "epsilon_dpo/beta_margin_grad_mean": -0.15696805715560913, "epsilon_dpo/beta_margin_grad_std": 0.16546384990215302, "epsilon_dpo/beta_margin_mean": 2.4271247386932373, "epsilon_dpo/beta_margin_std": 1.7556877136230469, "epsilon_dpo/loss_margin_mean": 130.051025390625, "grad_norm": 68.18460845947266, "kl/avg_steps": 0.875, "kl/beta": 0.018852191045880318, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.276818137766118e-07, "logits/chosen": -2.1893157958984375, "logits/rejected": -2.2103257179260254, "logps/chosen": -112.70558166503906, "logps/ref_chosen": -37.27526092529297, "logps/ref_rejected": -106.37206268310547, "logps/rejected": -311.8533935546875, "loss": 0.3942, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4119031429290771, "rewards/margins": 2.4271249771118164, "rewards/rejected": -3.8390281200408936, "step": 222 }, { "epoch": 0.3274596182085169, "epsilon_dpo/beta": 0.018556196242570877, "epsilon_dpo/beta_margin_grad_mean": -0.1877458542585373, "epsilon_dpo/beta_margin_grad_std": 0.21679145097732544, "epsilon_dpo/beta_margin_mean": 2.193220615386963, "epsilon_dpo/beta_margin_std": 1.8057856559753418, "epsilon_dpo/loss_margin_mean": 118.63549041748047, "grad_norm": 77.07755279541016, "kl/avg_steps": 0.71875, "kl/beta": 0.018688665702939034, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -2.1241304874420166, "logits/rejected": -2.0650038719177246, "logps/chosen": -117.65826416015625, "logps/ref_chosen": -32.709083557128906, "logps/ref_rejected": -86.52430725097656, "logps/rejected": -290.10894775390625, "loss": 0.542, "rewards/accuracies": 0.859375, "rewards/chosen": -1.581796407699585, "rewards/margins": 2.193220615386963, "rewards/rejected": -3.775017261505127, "step": 223 }, { "epoch": 0.328928046989721, "epsilon_dpo/beta": 0.018406376242637634, "epsilon_dpo/beta_margin_grad_mean": -0.15500150620937347, "epsilon_dpo/beta_margin_grad_std": 0.1830359399318695, "epsilon_dpo/beta_margin_mean": 2.481572389602661, "epsilon_dpo/beta_margin_std": 1.7067841291427612, "epsilon_dpo/loss_margin_mean": 135.1320037841797, "grad_norm": 64.31571197509766, "kl/avg_steps": 0.8125, "kl/beta": 0.018555298447608948, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -2.1345582008361816, "logits/rejected": -1.9911662340164185, "logps/chosen": -113.04464721679688, "logps/ref_chosen": -35.54627990722656, "logps/ref_rejected": -81.60932922363281, "logps/rejected": -294.23968505859375, "loss": 0.4078, "rewards/accuracies": 0.921875, "rewards/chosen": -1.429018497467041, "rewards/margins": 2.481572389602661, "rewards/rejected": -3.910590648651123, "step": 224 }, { "epoch": 0.3303964757709251, "epsilon_dpo/beta": 0.018246525898575783, "epsilon_dpo/beta_margin_grad_mean": -0.1459973156452179, "epsilon_dpo/beta_margin_grad_std": 0.19097650051116943, "epsilon_dpo/beta_margin_mean": 2.6388025283813477, "epsilon_dpo/beta_margin_std": 1.800585150718689, "epsilon_dpo/loss_margin_mean": 144.87014770507812, "grad_norm": 65.03907775878906, "kl/avg_steps": 0.875, "kl/beta": 0.018405752256512642, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.249525076191759e-07, "logits/chosen": -2.1871337890625, "logits/rejected": -2.1202986240386963, "logps/chosen": -111.3397216796875, "logps/ref_chosen": -34.65919876098633, "logps/ref_rejected": -106.95365905761719, "logps/rejected": -328.50433349609375, "loss": 0.4097, "rewards/accuracies": 0.953125, "rewards/chosen": -1.4013426303863525, "rewards/margins": 2.6388025283813477, "rewards/rejected": -4.040144920349121, "step": 225 }, { "epoch": 0.33186490455212925, "epsilon_dpo/beta": 0.01808825507760048, "epsilon_dpo/beta_margin_grad_mean": -0.17010191082954407, "epsilon_dpo/beta_margin_grad_std": 0.16130515933036804, "epsilon_dpo/beta_margin_mean": 2.194584846496582, "epsilon_dpo/beta_margin_std": 1.5229578018188477, "epsilon_dpo/loss_margin_mean": 121.48944854736328, "grad_norm": 53.89804458618164, "kl/avg_steps": 0.875, "kl/beta": 0.01824609935283661, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -2.2922072410583496, "logits/rejected": -2.074962615966797, "logps/chosen": -120.92872619628906, "logps/ref_chosen": -44.4660758972168, "logps/ref_rejected": -84.042724609375, "logps/rejected": -281.99481201171875, "loss": 0.4196, "rewards/accuracies": 0.953125, "rewards/chosen": -1.3841276168823242, "rewards/margins": 2.194584846496582, "rewards/rejected": -3.5787124633789062, "step": 226 }, { "epoch": 0.3333333333333333, "epsilon_dpo/beta": 0.017948314547538757, "epsilon_dpo/beta_margin_grad_mean": -0.15359929203987122, "epsilon_dpo/beta_margin_grad_std": 0.18651175498962402, "epsilon_dpo/beta_margin_mean": 2.6601450443267822, "epsilon_dpo/beta_margin_std": 1.9379621744155884, "epsilon_dpo/loss_margin_mean": 148.5768585205078, "grad_norm": 62.88763427734375, "kl/avg_steps": 0.78125, "kl/beta": 0.018087830394506454, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -2.217491865158081, "logits/rejected": -2.0915043354034424, "logps/chosen": -111.29264831542969, "logps/ref_chosen": -33.7484245300293, "logps/ref_rejected": -106.498291015625, "logps/rejected": -332.619384765625, "loss": 0.4017, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3941301107406616, "rewards/margins": 2.6601450443267822, "rewards/rejected": -4.054275035858154, "step": 227 }, { "epoch": 0.33480176211453744, "epsilon_dpo/beta": 0.0177923534065485, "epsilon_dpo/beta_margin_grad_mean": -0.15493857860565186, "epsilon_dpo/beta_margin_grad_std": 0.1513359248638153, "epsilon_dpo/beta_margin_mean": 2.2347021102905273, "epsilon_dpo/beta_margin_std": 1.3810467720031738, "epsilon_dpo/loss_margin_mean": 125.76899719238281, "grad_norm": 55.945281982421875, "kl/avg_steps": 0.875, "kl/beta": 0.017947614192962646, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -2.300729990005493, "logits/rejected": -2.2182745933532715, "logps/chosen": -128.66030883789062, "logps/ref_chosen": -45.935726165771484, "logps/ref_rejected": -96.16656494140625, "logps/rejected": -304.66015625, "loss": 0.3771, "rewards/accuracies": 0.96875, "rewards/chosen": -1.473421335220337, "rewards/margins": 2.2347021102905273, "rewards/rejected": -3.7081234455108643, "step": 228 }, { "epoch": 0.33627019089574156, "epsilon_dpo/beta": 0.01766026020050049, "epsilon_dpo/beta_margin_grad_mean": -0.155641108751297, "epsilon_dpo/beta_margin_grad_std": 0.19922208786010742, "epsilon_dpo/beta_margin_mean": 2.632094383239746, "epsilon_dpo/beta_margin_std": 1.839905023574829, "epsilon_dpo/loss_margin_mean": 149.47091674804688, "grad_norm": 64.01463317871094, "kl/avg_steps": 0.75, "kl/beta": 0.017791934311389923, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.212490049118951e-07, "logits/chosen": -2.2102646827697754, "logits/rejected": -2.084036350250244, "logps/chosen": -100.0278091430664, "logps/ref_chosen": -35.16382598876953, "logps/ref_rejected": -89.91634368896484, "logps/rejected": -304.251220703125, "loss": 0.4188, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1469919681549072, "rewards/margins": 2.632094383239746, "rewards/rejected": -3.779086112976074, "step": 229 }, { "epoch": 0.3377386196769457, "epsilon_dpo/beta": 0.017512237653136253, "epsilon_dpo/beta_margin_grad_mean": -0.15799804031848907, "epsilon_dpo/beta_margin_grad_std": 0.1725914478302002, "epsilon_dpo/beta_margin_mean": 2.30881667137146, "epsilon_dpo/beta_margin_std": 1.4573560953140259, "epsilon_dpo/loss_margin_mean": 132.07550048828125, "grad_norm": 57.6724853515625, "kl/avg_steps": 0.84375, "kl/beta": 0.017659489065408707, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.203117865141635e-07, "logits/chosen": -1.9427410364151, "logits/rejected": -1.9851582050323486, "logps/chosen": -114.36758422851562, "logps/ref_chosen": -28.29522705078125, "logps/ref_rejected": -89.92157745361328, "logps/rejected": -308.0694274902344, "loss": 0.4007, "rewards/accuracies": 0.953125, "rewards/chosen": -1.5095224380493164, "rewards/margins": 2.30881667137146, "rewards/rejected": -3.8183393478393555, "step": 230 }, { "epoch": 0.3392070484581498, "epsilon_dpo/beta": 0.01736571453511715, "epsilon_dpo/beta_margin_grad_mean": -0.15104635059833527, "epsilon_dpo/beta_margin_grad_std": 0.16789193451404572, "epsilon_dpo/beta_margin_mean": 2.344306468963623, "epsilon_dpo/beta_margin_std": 1.5224120616912842, "epsilon_dpo/loss_margin_mean": 135.25741577148438, "grad_norm": 58.66816711425781, "kl/avg_steps": 0.84375, "kl/beta": 0.01751173473894596, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -2.2687087059020996, "logits/rejected": -2.0110936164855957, "logps/chosen": -122.01680755615234, "logps/ref_chosen": -39.274810791015625, "logps/ref_rejected": -80.1042709350586, "logps/rejected": -298.10369873046875, "loss": 0.3873, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4385372400283813, "rewards/margins": 2.344306468963623, "rewards/rejected": -3.782843828201294, "step": 231 }, { "epoch": 0.3406754772393539, "epsilon_dpo/beta": 0.0172258447855711, "epsilon_dpo/beta_margin_grad_mean": -0.16703417897224426, "epsilon_dpo/beta_margin_grad_std": 0.17445211112499237, "epsilon_dpo/beta_margin_mean": 2.3062567710876465, "epsilon_dpo/beta_margin_std": 1.650961995124817, "epsilon_dpo/loss_margin_mean": 134.1649169921875, "grad_norm": 55.898075103759766, "kl/avg_steps": 0.8125, "kl/beta": 0.017365215346217155, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -2.2190351486206055, "logits/rejected": -2.2044219970703125, "logps/chosen": -121.8990707397461, "logps/ref_chosen": -42.393104553222656, "logps/ref_rejected": -88.90144348144531, "logps/rejected": -302.57232666015625, "loss": 0.4288, "rewards/accuracies": 0.953125, "rewards/chosen": -1.3723829984664917, "rewards/margins": 2.3062567710876465, "rewards/rejected": -3.6786396503448486, "step": 232 }, { "epoch": 0.342143906020558, "epsilon_dpo/beta": 0.017076246440410614, "epsilon_dpo/beta_margin_grad_mean": -0.11840350925922394, "epsilon_dpo/beta_margin_grad_std": 0.15504033863544464, "epsilon_dpo/beta_margin_mean": 2.873262882232666, "epsilon_dpo/beta_margin_std": 1.6749653816223145, "epsilon_dpo/loss_margin_mean": 168.5035858154297, "grad_norm": 49.05929183959961, "kl/avg_steps": 0.875, "kl/beta": 0.01722525991499424, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.174733034541245e-07, "logits/chosen": -2.11576509475708, "logits/rejected": -2.1665868759155273, "logps/chosen": -108.28323364257812, "logps/ref_chosen": -30.483036041259766, "logps/ref_rejected": -115.03839111328125, "logps/rejected": -361.3421630859375, "loss": 0.2959, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3310226202011108, "rewards/margins": 2.873262882232666, "rewards/rejected": -4.204285621643066, "step": 233 }, { "epoch": 0.3436123348017621, "epsilon_dpo/beta": 0.01694413460791111, "epsilon_dpo/beta_margin_grad_mean": -0.15992534160614014, "epsilon_dpo/beta_margin_grad_std": 0.19790522754192352, "epsilon_dpo/beta_margin_mean": 2.6408679485321045, "epsilon_dpo/beta_margin_std": 2.0294158458709717, "epsilon_dpo/loss_margin_mean": 156.26454162597656, "grad_norm": 67.56781005859375, "kl/avg_steps": 0.78125, "kl/beta": 0.01707584597170353, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.165182829193126e-07, "logits/chosen": -2.0071873664855957, "logits/rejected": -2.1877129077911377, "logps/chosen": -111.21868133544922, "logps/ref_chosen": -30.016942977905273, "logps/ref_rejected": -108.75608825683594, "logps/rejected": -346.22235107421875, "loss": 0.4328, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3788411617279053, "rewards/margins": 2.6408681869506836, "rewards/rejected": -4.01970911026001, "step": 234 }, { "epoch": 0.34508076358296624, "epsilon_dpo/beta": 0.01681278459727764, "epsilon_dpo/beta_margin_grad_mean": -0.175105020403862, "epsilon_dpo/beta_margin_grad_std": 0.1960555464029312, "epsilon_dpo/beta_margin_mean": 2.271127700805664, "epsilon_dpo/beta_margin_std": 1.7841382026672363, "epsilon_dpo/loss_margin_mean": 135.44342041015625, "grad_norm": 77.5376205444336, "kl/avg_steps": 0.78125, "kl/beta": 0.016943475231528282, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -2.111543655395508, "logits/rejected": -2.081702470779419, "logps/chosen": -141.7093963623047, "logps/ref_chosen": -46.081146240234375, "logps/ref_rejected": -96.02244567871094, "logps/rejected": -327.0941162109375, "loss": 0.4894, "rewards/accuracies": 0.921875, "rewards/chosen": -1.6119556427001953, "rewards/margins": 2.271127700805664, "rewards/rejected": -3.8830833435058594, "step": 235 }, { "epoch": 0.3465491923641703, "epsilon_dpo/beta": 0.01666669175028801, "epsilon_dpo/beta_margin_grad_mean": -0.1637529879808426, "epsilon_dpo/beta_margin_grad_std": 0.16410239040851593, "epsilon_dpo/beta_margin_mean": 2.3281750679016113, "epsilon_dpo/beta_margin_std": 1.6591843366622925, "epsilon_dpo/loss_margin_mean": 139.8822784423828, "grad_norm": 56.97810363769531, "kl/avg_steps": 0.875, "kl/beta": 0.01681213080883026, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -2.05222487449646, "logits/rejected": -2.147634267807007, "logps/chosen": -112.40109252929688, "logps/ref_chosen": -31.17489242553711, "logps/ref_rejected": -108.55508422851562, "logps/rejected": -329.66357421875, "loss": 0.4048, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3549550771713257, "rewards/margins": 2.3281750679016113, "rewards/rejected": -3.6831302642822266, "step": 236 }, { "epoch": 0.34801762114537443, "epsilon_dpo/beta": 0.01652212254703045, "epsilon_dpo/beta_margin_grad_mean": -0.13068081438541412, "epsilon_dpo/beta_margin_grad_std": 0.15401048958301544, "epsilon_dpo/beta_margin_mean": 2.651111125946045, "epsilon_dpo/beta_margin_std": 1.6006019115447998, "epsilon_dpo/loss_margin_mean": 160.67715454101562, "grad_norm": 47.88125991821289, "kl/avg_steps": 0.875, "kl/beta": 0.01666630059480667, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.136269950853473e-07, "logits/chosen": -2.0394325256347656, "logits/rejected": -2.007416248321533, "logps/chosen": -99.72041320800781, "logps/ref_chosen": -27.259479522705078, "logps/ref_rejected": -100.87033081054688, "logps/rejected": -334.0084228515625, "loss": 0.3225, "rewards/accuracies": 0.953125, "rewards/chosen": -1.198293924331665, "rewards/margins": 2.651111125946045, "rewards/rejected": -3.849404811859131, "step": 237 }, { "epoch": 0.34948604992657856, "epsilon_dpo/beta": 0.016383972018957138, "epsilon_dpo/beta_margin_grad_mean": -0.1836518496274948, "epsilon_dpo/beta_margin_grad_std": 0.17506077885627747, "epsilon_dpo/beta_margin_mean": 2.173346996307373, "epsilon_dpo/beta_margin_std": 1.7011436223983765, "epsilon_dpo/loss_margin_mean": 132.8949737548828, "grad_norm": 65.86769104003906, "kl/avg_steps": 0.84375, "kl/beta": 0.01652173511683941, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.126545750510605e-07, "logits/chosen": -2.0702290534973145, "logits/rejected": -2.123429298400879, "logps/chosen": -137.6107177734375, "logps/ref_chosen": -40.190574645996094, "logps/ref_rejected": -93.49894714355469, "logps/rejected": -323.8140563964844, "loss": 0.4646, "rewards/accuracies": 0.9375, "rewards/chosen": -1.598804235458374, "rewards/margins": 2.173346996307373, "rewards/rejected": -3.772151231765747, "step": 238 }, { "epoch": 0.3509544787077827, "epsilon_dpo/beta": 0.016236647963523865, "epsilon_dpo/beta_margin_grad_mean": -0.16019320487976074, "epsilon_dpo/beta_margin_grad_std": 0.1673043966293335, "epsilon_dpo/beta_margin_mean": 2.3489763736724854, "epsilon_dpo/beta_margin_std": 1.5995773077011108, "epsilon_dpo/loss_margin_mean": 144.83961486816406, "grad_norm": 56.121368408203125, "kl/avg_steps": 0.90625, "kl/beta": 0.016383498907089233, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.116778689174514e-07, "logits/chosen": -2.0602469444274902, "logits/rejected": -1.9942574501037598, "logps/chosen": -124.91606140136719, "logps/ref_chosen": -38.33943176269531, "logps/ref_rejected": -99.64225769042969, "logps/rejected": -331.0585021972656, "loss": 0.4034, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4062585830688477, "rewards/margins": 2.3489763736724854, "rewards/rejected": -3.755234956741333, "step": 239 }, { "epoch": 0.3524229074889868, "epsilon_dpo/beta": 0.016100972890853882, "epsilon_dpo/beta_margin_grad_mean": -0.16005262732505798, "epsilon_dpo/beta_margin_grad_std": 0.16643419861793518, "epsilon_dpo/beta_margin_mean": 2.2492594718933105, "epsilon_dpo/beta_margin_std": 1.4863446950912476, "epsilon_dpo/loss_margin_mean": 139.95294189453125, "grad_norm": 66.01368713378906, "kl/avg_steps": 0.84375, "kl/beta": 0.016236357390880585, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 4.106969024216348e-07, "logits/chosen": -2.0398552417755127, "logits/rejected": -1.8625662326812744, "logps/chosen": -111.68765258789062, "logps/ref_chosen": -36.1579704284668, "logps/ref_rejected": -80.07916259765625, "logps/rejected": -295.5617980957031, "loss": 0.4095, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2190780639648438, "rewards/margins": 2.2492594718933105, "rewards/rejected": -3.4683375358581543, "step": 240 }, { "epoch": 0.35389133627019087, "epsilon_dpo/beta": 0.015971289947628975, "epsilon_dpo/beta_margin_grad_mean": -0.16114287078380585, "epsilon_dpo/beta_margin_grad_std": 0.18923403322696686, "epsilon_dpo/beta_margin_mean": 2.457042932510376, "epsilon_dpo/beta_margin_std": 1.8046914339065552, "epsilon_dpo/loss_margin_mean": 154.20245361328125, "grad_norm": 74.83716583251953, "kl/avg_steps": 0.8125, "kl/beta": 0.016100509092211723, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.097117014129903e-07, "logits/chosen": -2.196474313735962, "logits/rejected": -1.913820505142212, "logps/chosen": -126.07571411132812, "logps/ref_chosen": -44.0040397644043, "logps/ref_rejected": -93.001220703125, "logps/rejected": -329.2753601074219, "loss": 0.426, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3152108192443848, "rewards/margins": 2.457043170928955, "rewards/rejected": -3.7722537517547607, "step": 241 }, { "epoch": 0.355359765051395, "epsilon_dpo/beta": 0.015827594324946404, "epsilon_dpo/beta_margin_grad_mean": -0.14874504506587982, "epsilon_dpo/beta_margin_grad_std": 0.17723040282726288, "epsilon_dpo/beta_margin_mean": 2.45681095123291, "epsilon_dpo/beta_margin_std": 1.6646445989608765, "epsilon_dpo/loss_margin_mean": 155.43951416015625, "grad_norm": 78.47781372070312, "kl/avg_steps": 0.90625, "kl/beta": 0.0159707460552454, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.087222918524807e-07, "logits/chosen": -2.161656379699707, "logits/rejected": -1.9027358293533325, "logps/chosen": -109.82485961914062, "logps/ref_chosen": -32.014137268066406, "logps/ref_rejected": -91.38673400878906, "logps/rejected": -324.636962890625, "loss": 0.4175, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2343041896820068, "rewards/margins": 2.45681095123291, "rewards/rejected": -3.691114902496338, "step": 242 }, { "epoch": 0.3568281938325991, "epsilon_dpo/beta": 0.01569039188325405, "epsilon_dpo/beta_margin_grad_mean": -0.14957940578460693, "epsilon_dpo/beta_margin_grad_std": 0.1580885350704193, "epsilon_dpo/beta_margin_mean": 2.3924245834350586, "epsilon_dpo/beta_margin_std": 1.4725757837295532, "epsilon_dpo/loss_margin_mean": 152.6823272705078, "grad_norm": 58.3444709777832, "kl/avg_steps": 0.875, "kl/beta": 0.01582731120288372, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.07728699811968e-07, "logits/chosen": -2.0235860347747803, "logits/rejected": -1.7727184295654297, "logps/chosen": -119.57041931152344, "logps/ref_chosen": -37.93376922607422, "logps/ref_rejected": -81.79078674316406, "logps/rejected": -316.1097717285156, "loss": 0.3675, "rewards/accuracies": 0.953125, "rewards/chosen": -1.282745122909546, "rewards/margins": 2.3924245834350586, "rewards/rejected": -3.6751694679260254, "step": 243 }, { "epoch": 0.35829662261380324, "epsilon_dpo/beta": 0.015569004230201244, "epsilon_dpo/beta_margin_grad_mean": -0.18348611891269684, "epsilon_dpo/beta_margin_grad_std": 0.19074872136116028, "epsilon_dpo/beta_margin_mean": 2.0866005420684814, "epsilon_dpo/beta_margin_std": 1.515892744064331, "epsilon_dpo/loss_margin_mean": 134.38136291503906, "grad_norm": 60.811737060546875, "kl/avg_steps": 0.78125, "kl/beta": 0.015690024942159653, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.067309514735267e-07, "logits/chosen": -2.1224303245544434, "logits/rejected": -2.0141685009002686, "logps/chosen": -125.60481262207031, "logps/ref_chosen": -39.915008544921875, "logps/ref_rejected": -101.33251953125, "logps/rejected": -321.4036865234375, "loss": 0.4834, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3352429866790771, "rewards/margins": 2.0866003036499023, "rewards/rejected": -3.4218435287475586, "step": 244 }, { "epoch": 0.35976505139500736, "epsilon_dpo/beta": 0.015458044596016407, "epsilon_dpo/beta_margin_grad_mean": -0.19498883187770844, "epsilon_dpo/beta_margin_grad_std": 0.21323725581169128, "epsilon_dpo/beta_margin_mean": 2.181067943572998, "epsilon_dpo/beta_margin_std": 1.7734534740447998, "epsilon_dpo/loss_margin_mean": 141.5851593017578, "grad_norm": 70.79053497314453, "kl/avg_steps": 0.71875, "kl/beta": 0.01556839607656002, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.057290731287531e-07, "logits/chosen": -2.187222957611084, "logits/rejected": -1.977565050125122, "logps/chosen": -115.84690856933594, "logps/ref_chosen": -40.404693603515625, "logps/ref_rejected": -93.24897766113281, "logps/rejected": -310.2763671875, "loss": 0.538, "rewards/accuracies": 0.875, "rewards/chosen": -1.1700770854949951, "rewards/margins": 2.181067943572998, "rewards/rejected": -3.351145029067993, "step": 245 }, { "epoch": 0.36123348017621143, "epsilon_dpo/beta": 0.015333239920437336, "epsilon_dpo/beta_margin_grad_mean": -0.18123117089271545, "epsilon_dpo/beta_margin_grad_std": 0.17709243297576904, "epsilon_dpo/beta_margin_mean": 2.124358892440796, "epsilon_dpo/beta_margin_std": 1.5861018896102905, "epsilon_dpo/loss_margin_mean": 138.85049438476562, "grad_norm": 59.1838264465332, "kl/avg_steps": 0.8125, "kl/beta": 0.01545729674398899, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.047230911780736e-07, "logits/chosen": -2.1199212074279785, "logits/rejected": -1.9131314754486084, "logps/chosen": -134.75564575195312, "logps/ref_chosen": -46.212432861328125, "logps/ref_rejected": -90.18721008300781, "logps/rejected": -317.5809326171875, "loss": 0.4609, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3602585792541504, "rewards/margins": 2.124358892440796, "rewards/rejected": -3.4846174716949463, "step": 246 }, { "epoch": 0.36270190895741555, "epsilon_dpo/beta": 0.015209660865366459, "epsilon_dpo/beta_margin_grad_mean": -0.16919678449630737, "epsilon_dpo/beta_margin_grad_std": 0.20299823582172394, "epsilon_dpo/beta_margin_mean": 2.335465669631958, "epsilon_dpo/beta_margin_std": 1.7916253805160522, "epsilon_dpo/loss_margin_mean": 153.95811462402344, "grad_norm": 73.05105590820312, "kl/avg_steps": 0.8125, "kl/beta": 0.015332718379795551, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -2.0222108364105225, "logits/rejected": -2.00075626373291, "logps/chosen": -138.3466796875, "logps/ref_chosen": -41.23990249633789, "logps/ref_rejected": -111.74742126464844, "logps/rejected": -362.81231689453125, "loss": 0.4948, "rewards/accuracies": 0.90625, "rewards/chosen": -1.479392409324646, "rewards/margins": 2.335465431213379, "rewards/rejected": -3.8148579597473145, "step": 247 }, { "epoch": 0.3641703377386197, "epsilon_dpo/beta": 0.015063311904668808, "epsilon_dpo/beta_margin_grad_mean": -0.12139809876680374, "epsilon_dpo/beta_margin_grad_std": 0.12686574459075928, "epsilon_dpo/beta_margin_mean": 2.461768865585327, "epsilon_dpo/beta_margin_std": 1.234181523323059, "epsilon_dpo/loss_margin_mean": 163.502685546875, "grad_norm": 40.315460205078125, "kl/avg_steps": 0.96875, "kl/beta": 0.015209143981337547, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -1.9299895763397217, "logits/rejected": -2.0007243156433105, "logps/chosen": -102.28887939453125, "logps/ref_chosen": -25.618247985839844, "logps/ref_rejected": -97.59014892578125, "logps/rejected": -337.76348876953125, "loss": 0.2904, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1554369926452637, "rewards/margins": 2.461768865585327, "rewards/rejected": -3.61720609664917, "step": 248 }, { "epoch": 0.3656387665198238, "epsilon_dpo/beta": 0.014947031624615192, "epsilon_dpo/beta_margin_grad_mean": -0.18095283210277557, "epsilon_dpo/beta_margin_grad_std": 0.164636492729187, "epsilon_dpo/beta_margin_mean": 2.128697395324707, "epsilon_dpo/beta_margin_std": 1.604970097541809, "epsilon_dpo/loss_margin_mean": 142.73989868164062, "grad_norm": 58.40861129760742, "kl/avg_steps": 0.78125, "kl/beta": 0.015063218772411346, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -2.040287494659424, "logits/rejected": -1.8837002515792847, "logps/chosen": -134.6277618408203, "logps/ref_chosen": -42.576805114746094, "logps/ref_rejected": -87.38154602050781, "logps/rejected": -322.1723937988281, "loss": 0.4484, "rewards/accuracies": 0.90625, "rewards/chosen": -1.377791404724121, "rewards/margins": 2.128697395324707, "rewards/rejected": -3.506488800048828, "step": 249 }, { "epoch": 0.3671071953010279, "epsilon_dpo/beta": 0.014812478795647621, "epsilon_dpo/beta_margin_grad_mean": -0.16431747376918793, "epsilon_dpo/beta_margin_grad_std": 0.16239361464977264, "epsilon_dpo/beta_margin_mean": 2.1128971576690674, "epsilon_dpo/beta_margin_std": 1.335999608039856, "epsilon_dpo/loss_margin_mean": 142.83154296875, "grad_norm": 54.212379455566406, "kl/avg_steps": 0.90625, "kl/beta": 0.014946449548006058, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.006586590948141e-07, "logits/chosen": -2.1342391967773438, "logits/rejected": -1.7132606506347656, "logps/chosen": -126.69745635986328, "logps/ref_chosen": -43.33977508544922, "logps/ref_rejected": -79.84855651855469, "logps/rejected": -306.03778076171875, "loss": 0.4175, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2351545095443726, "rewards/margins": 2.1128971576690674, "rewards/rejected": -3.3480517864227295, "step": 250 }, { "epoch": 0.368575624082232, "epsilon_dpo/beta": 0.014693334698677063, "epsilon_dpo/beta_margin_grad_mean": -0.1908206194639206, "epsilon_dpo/beta_margin_grad_std": 0.17475423216819763, "epsilon_dpo/beta_margin_mean": 1.979662299156189, "epsilon_dpo/beta_margin_std": 1.4761016368865967, "epsilon_dpo/loss_margin_mean": 135.0380859375, "grad_norm": 58.540199279785156, "kl/avg_steps": 0.8125, "kl/beta": 0.01481221430003643, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -2.064459800720215, "logits/rejected": -1.8460227251052856, "logps/chosen": -128.45263671875, "logps/ref_chosen": -37.8934211730957, "logps/ref_rejected": -82.71955871582031, "logps/rejected": -308.31683349609375, "loss": 0.4894, "rewards/accuracies": 0.9375, "rewards/chosen": -1.332367181777954, "rewards/margins": 1.979662299156189, "rewards/rejected": -3.3120296001434326, "step": 251 }, { "epoch": 0.3700440528634361, "epsilon_dpo/beta": 0.014561137184500694, "epsilon_dpo/beta_margin_grad_mean": -0.16394339501857758, "epsilon_dpo/beta_margin_grad_std": 0.15933236479759216, "epsilon_dpo/beta_margin_mean": 2.167374849319458, "epsilon_dpo/beta_margin_std": 1.4150327444076538, "epsilon_dpo/loss_margin_mean": 149.01345825195312, "grad_norm": 61.02168273925781, "kl/avg_steps": 0.90625, "kl/beta": 0.014692834578454494, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -2.0972189903259277, "logits/rejected": -1.8574435710906982, "logps/chosen": -143.55337524414062, "logps/ref_chosen": -49.172019958496094, "logps/ref_rejected": -91.81843566894531, "logps/rejected": -335.2132568359375, "loss": 0.4052, "rewards/accuracies": 0.953125, "rewards/chosen": -1.3763771057128906, "rewards/margins": 2.167374610900879, "rewards/rejected": -3.5437517166137695, "step": 252 }, { "epoch": 0.37151248164464024, "epsilon_dpo/beta": 0.014448565430939198, "epsilon_dpo/beta_margin_grad_mean": -0.1948602944612503, "epsilon_dpo/beta_margin_grad_std": 0.1849391907453537, "epsilon_dpo/beta_margin_mean": 1.9554654359817505, "epsilon_dpo/beta_margin_std": 1.4330785274505615, "epsilon_dpo/loss_margin_mean": 135.68392944335938, "grad_norm": 65.06855773925781, "kl/avg_steps": 0.78125, "kl/beta": 0.014560877345502377, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -2.0035042762756348, "logits/rejected": -2.029510021209717, "logps/chosen": -156.42469787597656, "logps/ref_chosen": -55.18561553955078, "logps/ref_rejected": -104.15153503417969, "logps/rejected": -341.0745544433594, "loss": 0.5018, "rewards/accuracies": 0.890625, "rewards/chosen": -1.4654229879379272, "rewards/margins": 1.95546555519104, "rewards/rejected": -3.4208884239196777, "step": 253 }, { "epoch": 0.37298091042584436, "epsilon_dpo/beta": 0.014350106939673424, "epsilon_dpo/beta_margin_grad_mean": -0.19600307941436768, "epsilon_dpo/beta_margin_grad_std": 0.19512991607189178, "epsilon_dpo/beta_margin_mean": 2.031984806060791, "epsilon_dpo/beta_margin_std": 1.5720576047897339, "epsilon_dpo/loss_margin_mean": 142.08840942382812, "grad_norm": 57.888275146484375, "kl/avg_steps": 0.6875, "kl/beta": 0.014448001980781555, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.965307091713037e-07, "logits/chosen": -2.0362446308135986, "logits/rejected": -1.8677754402160645, "logps/chosen": -133.6912078857422, "logps/ref_chosen": -44.80467224121094, "logps/ref_rejected": -93.50021362304688, "logps/rejected": -324.47515869140625, "loss": 0.5136, "rewards/accuracies": 0.859375, "rewards/chosen": -1.277719259262085, "rewards/margins": 2.031984806060791, "rewards/rejected": -3.309704065322876, "step": 254 }, { "epoch": 0.3744493392070485, "epsilon_dpo/beta": 0.01423418615013361, "epsilon_dpo/beta_margin_grad_mean": -0.16781477630138397, "epsilon_dpo/beta_margin_grad_std": 0.1848216950893402, "epsilon_dpo/beta_margin_mean": 2.3087897300720215, "epsilon_dpo/beta_margin_std": 1.6275063753128052, "epsilon_dpo/loss_margin_mean": 162.57176208496094, "grad_norm": 61.83622360229492, "kl/avg_steps": 0.8125, "kl/beta": 0.014349350705742836, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.954890003969163e-07, "logits/chosen": -1.7996987104415894, "logits/rejected": -1.752530574798584, "logps/chosen": -130.06533813476562, "logps/ref_chosen": -37.239234924316406, "logps/ref_rejected": -96.95054626464844, "logps/rejected": -352.3484191894531, "loss": 0.4448, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3225433826446533, "rewards/margins": 2.3087897300720215, "rewards/rejected": -3.631333112716675, "step": 255 }, { "epoch": 0.37591776798825255, "epsilon_dpo/beta": 0.014110567979514599, "epsilon_dpo/beta_margin_grad_mean": -0.14517581462860107, "epsilon_dpo/beta_margin_grad_std": 0.1478438377380371, "epsilon_dpo/beta_margin_mean": 2.3481807708740234, "epsilon_dpo/beta_margin_std": 1.3911449909210205, "epsilon_dpo/loss_margin_mean": 166.6383819580078, "grad_norm": 47.72064208984375, "kl/avg_steps": 0.875, "kl/beta": 0.01423370186239481, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.944434578520628e-07, "logits/chosen": -1.8363198041915894, "logits/rejected": -1.8857228755950928, "logps/chosen": -118.48297119140625, "logps/ref_chosen": -35.025508880615234, "logps/ref_rejected": -99.25279998779297, "logps/rejected": -349.3486328125, "loss": 0.3517, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1789929866790771, "rewards/margins": 2.3481807708740234, "rewards/rejected": -3.5271737575531006, "step": 256 }, { "epoch": 0.37738619676945667, "epsilon_dpo/beta": 0.013992580585181713, "epsilon_dpo/beta_margin_grad_mean": -0.1678382307291031, "epsilon_dpo/beta_margin_grad_std": 0.16572539508342743, "epsilon_dpo/beta_margin_mean": 2.1580240726470947, "epsilon_dpo/beta_margin_std": 1.4499742984771729, "epsilon_dpo/loss_margin_mean": 154.5001983642578, "grad_norm": 58.855709075927734, "kl/avg_steps": 0.84375, "kl/beta": 0.014110236428678036, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.933941090877615e-07, "logits/chosen": -1.8602499961853027, "logits/rejected": -1.7255544662475586, "logps/chosen": -125.66242980957031, "logps/ref_chosen": -34.74375534057617, "logps/ref_rejected": -84.338134765625, "logps/rejected": -329.75701904296875, "loss": 0.4202, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2731714248657227, "rewards/margins": 2.1580240726470947, "rewards/rejected": -3.4311952590942383, "step": 257 }, { "epoch": 0.3788546255506608, "epsilon_dpo/beta": 0.013875506818294525, "epsilon_dpo/beta_margin_grad_mean": -0.16494029760360718, "epsilon_dpo/beta_margin_grad_std": 0.19155991077423096, "epsilon_dpo/beta_margin_mean": 2.2436838150024414, "epsilon_dpo/beta_margin_std": 1.5077780485153198, "epsilon_dpo/loss_margin_mean": 162.05101013183594, "grad_norm": 56.01180648803711, "kl/avg_steps": 0.84375, "kl/beta": 0.01399217825382948, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.923409817553284e-07, "logits/chosen": -1.829561710357666, "logits/rejected": -1.8402704000473022, "logps/chosen": -128.53773498535156, "logps/ref_chosen": -38.32011032104492, "logps/ref_rejected": -104.34061431884766, "logps/rejected": -356.6092529296875, "loss": 0.4481, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2534571886062622, "rewards/margins": 2.2436838150024414, "rewards/rejected": -3.497140884399414, "step": 258 }, { "epoch": 0.3803230543318649, "epsilon_dpo/beta": 0.013755074702203274, "epsilon_dpo/beta_margin_grad_mean": -0.17741423845291138, "epsilon_dpo/beta_margin_grad_std": 0.16296601295471191, "epsilon_dpo/beta_margin_mean": 2.0163936614990234, "epsilon_dpo/beta_margin_std": 1.3358848094940186, "epsilon_dpo/loss_margin_mean": 146.82223510742188, "grad_norm": 53.66272735595703, "kl/avg_steps": 0.875, "kl/beta": 0.01387510634958744, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -1.766379952430725, "logits/rejected": -1.7046854496002197, "logps/chosen": -133.28622436523438, "logps/ref_chosen": -37.96626663208008, "logps/ref_rejected": -94.62816619873047, "logps/rejected": -336.7703552246094, "loss": 0.4435, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3125896453857422, "rewards/margins": 2.0163938999176025, "rewards/rejected": -3.3289833068847656, "step": 259 }, { "epoch": 0.38179148311306904, "epsilon_dpo/beta": 0.01364436000585556, "epsilon_dpo/beta_margin_grad_mean": -0.18185892701148987, "epsilon_dpo/beta_margin_grad_std": 0.1839314103126526, "epsilon_dpo/beta_margin_mean": 2.0660133361816406, "epsilon_dpo/beta_margin_std": 1.5317002534866333, "epsilon_dpo/loss_margin_mean": 151.79209899902344, "grad_norm": 71.4144287109375, "kl/avg_steps": 0.8125, "kl/beta": 0.013754752464592457, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -1.7605104446411133, "logits/rejected": -1.844254732131958, "logps/chosen": -128.06361389160156, "logps/ref_chosen": -29.670434951782227, "logps/ref_rejected": -101.38003540039062, "logps/rejected": -351.5653076171875, "loss": 0.4876, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3459320068359375, "rewards/margins": 2.0660133361816406, "rewards/rejected": -3.411945343017578, "step": 260 }, { "epoch": 0.3832599118942731, "epsilon_dpo/beta": 0.013534392230212688, "epsilon_dpo/beta_margin_grad_mean": -0.14695821702480316, "epsilon_dpo/beta_margin_grad_std": 0.183539479970932, "epsilon_dpo/beta_margin_mean": 2.5458641052246094, "epsilon_dpo/beta_margin_std": 1.7510801553726196, "epsilon_dpo/loss_margin_mean": 188.52330017089844, "grad_norm": 66.16899871826172, "kl/avg_steps": 0.8125, "kl/beta": 0.013643896207213402, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.891592063515376e-07, "logits/chosen": -1.7042864561080933, "logits/rejected": -1.5886824131011963, "logps/chosen": -113.61106872558594, "logps/ref_chosen": -31.892623901367188, "logps/ref_rejected": -94.94419860839844, "logps/rejected": -365.1859436035156, "loss": 0.396, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1082427501678467, "rewards/margins": 2.5458641052246094, "rewards/rejected": -3.654106855392456, "step": 261 }, { "epoch": 0.38472834067547723, "epsilon_dpo/beta": 0.013416852802038193, "epsilon_dpo/beta_margin_grad_mean": -0.16779224574565887, "epsilon_dpo/beta_margin_grad_std": 0.15692253410816193, "epsilon_dpo/beta_margin_mean": 2.050642251968384, "epsilon_dpo/beta_margin_std": 1.2614741325378418, "epsilon_dpo/loss_margin_mean": 153.06997680664062, "grad_norm": 56.77939224243164, "kl/avg_steps": 0.875, "kl/beta": 0.013533933088183403, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.880912432401264e-07, "logits/chosen": -1.6830520629882812, "logits/rejected": -1.5291244983673096, "logps/chosen": -134.6734619140625, "logps/ref_chosen": -34.26129913330078, "logps/ref_rejected": -90.29788208007812, "logps/rejected": -343.780029296875, "loss": 0.4156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.348780632019043, "rewards/margins": 2.050642251968384, "rewards/rejected": -3.3994228839874268, "step": 262 }, { "epoch": 0.38619676945668135, "epsilon_dpo/beta": 0.013304665684700012, "epsilon_dpo/beta_margin_grad_mean": -0.14855755865573883, "epsilon_dpo/beta_margin_grad_std": 0.16537067294120789, "epsilon_dpo/beta_margin_mean": 2.3461291790008545, "epsilon_dpo/beta_margin_std": 1.4775606393814087, "epsilon_dpo/loss_margin_mean": 176.666748046875, "grad_norm": 59.517337799072266, "kl/avg_steps": 0.84375, "kl/beta": 0.013416538015007973, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.870196412960302e-07, "logits/chosen": -1.8858226537704468, "logits/rejected": -1.5893621444702148, "logps/chosen": -131.9771728515625, "logps/ref_chosen": -39.4767951965332, "logps/ref_rejected": -102.44886779785156, "logps/rejected": -371.615966796875, "loss": 0.3749, "rewards/accuracies": 0.921875, "rewards/chosen": -1.233377456665039, "rewards/margins": 2.3461294174194336, "rewards/rejected": -3.5795066356658936, "step": 263 }, { "epoch": 0.3876651982378855, "epsilon_dpo/beta": 0.01318918913602829, "epsilon_dpo/beta_margin_grad_mean": -0.15769457817077637, "epsilon_dpo/beta_margin_grad_std": 0.15815286338329315, "epsilon_dpo/beta_margin_mean": 2.2963368892669678, "epsilon_dpo/beta_margin_std": 1.5442652702331543, "epsilon_dpo/loss_margin_mean": 174.36825561523438, "grad_norm": 60.96957015991211, "kl/avg_steps": 0.875, "kl/beta": 0.013304282911121845, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -1.6896053552627563, "logits/rejected": -1.7085572481155396, "logps/chosen": -123.23445129394531, "logps/ref_chosen": -38.707645416259766, "logps/ref_rejected": -100.15180969238281, "logps/rejected": -359.046875, "loss": 0.3936, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1168205738067627, "rewards/margins": 2.296337127685547, "rewards/rejected": -3.4131574630737305, "step": 264 }, { "epoch": 0.3891336270190896, "epsilon_dpo/beta": 0.013070664368569851, "epsilon_dpo/beta_margin_grad_mean": -0.17950545251369476, "epsilon_dpo/beta_margin_grad_std": 0.168075293302536, "epsilon_dpo/beta_margin_mean": 2.0962088108062744, "epsilon_dpo/beta_margin_std": 1.5082602500915527, "epsilon_dpo/loss_margin_mean": 160.54234313964844, "grad_norm": 61.71824645996094, "kl/avg_steps": 0.90625, "kl/beta": 0.013188880868256092, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.848656339557562e-07, "logits/chosen": -1.6642320156097412, "logits/rejected": -1.5806505680084229, "logps/chosen": -145.28750610351562, "logps/ref_chosen": -37.51420593261719, "logps/ref_rejected": -94.66896057128906, "logps/rejected": -362.984619140625, "loss": 0.4478, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4101543426513672, "rewards/margins": 2.0962088108062744, "rewards/rejected": -3.5063629150390625, "step": 265 }, { "epoch": 0.39060205580029367, "epsilon_dpo/beta": 0.012973698787391186, "epsilon_dpo/beta_margin_grad_mean": -0.20105549693107605, "epsilon_dpo/beta_margin_grad_std": 0.19386924803256989, "epsilon_dpo/beta_margin_mean": 1.843229055404663, "epsilon_dpo/beta_margin_std": 1.3907108306884766, "epsilon_dpo/loss_margin_mean": 142.52218627929688, "grad_norm": 68.82777404785156, "kl/avg_steps": 0.75, "kl/beta": 0.013070429675281048, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -1.6910204887390137, "logits/rejected": -1.2894883155822754, "logps/chosen": -167.84719848632812, "logps/ref_chosen": -51.23357391357422, "logps/ref_rejected": -73.29232025146484, "logps/rejected": -332.4281311035156, "loss": 0.5442, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5156571865081787, "rewards/margins": 1.843229055404663, "rewards/rejected": -3.358886241912842, "step": 266 }, { "epoch": 0.3920704845814978, "epsilon_dpo/beta": 0.012881174683570862, "epsilon_dpo/beta_margin_grad_mean": -0.19470059871673584, "epsilon_dpo/beta_margin_grad_std": 0.1797318160533905, "epsilon_dpo/beta_margin_mean": 1.9295697212219238, "epsilon_dpo/beta_margin_std": 1.4559646844863892, "epsilon_dpo/loss_margin_mean": 150.2775115966797, "grad_norm": 67.35930633544922, "kl/avg_steps": 0.71875, "kl/beta": 0.012973131611943245, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -1.8555903434753418, "logits/rejected": -1.507951021194458, "logps/chosen": -154.36102294921875, "logps/ref_chosen": -50.439453125, "logps/ref_rejected": -95.28913879394531, "logps/rejected": -349.48822021484375, "loss": 0.4993, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3415794372558594, "rewards/margins": 1.9295697212219238, "rewards/rejected": -3.271149158477783, "step": 267 }, { "epoch": 0.3935389133627019, "epsilon_dpo/beta": 0.012789253145456314, "epsilon_dpo/beta_margin_grad_mean": -0.21375124156475067, "epsilon_dpo/beta_margin_grad_std": 0.20960654318332672, "epsilon_dpo/beta_margin_mean": 1.9788737297058105, "epsilon_dpo/beta_margin_std": 1.7514647245407104, "epsilon_dpo/loss_margin_mean": 155.29205322265625, "grad_norm": 80.12260437011719, "kl/avg_steps": 0.71875, "kl/beta": 0.012880552560091019, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -1.6909327507019043, "logits/rejected": -1.6416935920715332, "logps/chosen": -156.72665405273438, "logps/ref_chosen": -42.84587097167969, "logps/ref_rejected": -108.26278686523438, "logps/rejected": -377.4356384277344, "loss": 0.5814, "rewards/accuracies": 0.859375, "rewards/chosen": -1.461806058883667, "rewards/margins": 1.9788737297058105, "rewards/rejected": -3.4406800270080566, "step": 268 }, { "epoch": 0.39500734214390604, "epsilon_dpo/beta": 0.012697985395789146, "epsilon_dpo/beta_margin_grad_mean": -0.15136900544166565, "epsilon_dpo/beta_margin_grad_std": 0.18286660313606262, "epsilon_dpo/beta_margin_mean": 2.484262228012085, "epsilon_dpo/beta_margin_std": 1.5870592594146729, "epsilon_dpo/loss_margin_mean": 196.23475646972656, "grad_norm": 59.1228141784668, "kl/avg_steps": 0.71875, "kl/beta": 0.012788633815944195, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -1.7581638097763062, "logits/rejected": -1.7033562660217285, "logps/chosen": -142.43875122070312, "logps/ref_chosen": -43.99567413330078, "logps/ref_rejected": -115.62770080566406, "logps/rejected": -410.3055419921875, "loss": 0.3916, "rewards/accuracies": 0.90625, "rewards/chosen": -1.253930926322937, "rewards/margins": 2.484261989593506, "rewards/rejected": -3.7381930351257324, "step": 269 }, { "epoch": 0.3964757709251101, "epsilon_dpo/beta": 0.012599433772265911, "epsilon_dpo/beta_margin_grad_mean": -0.18488836288452148, "epsilon_dpo/beta_margin_grad_std": 0.2021288424730301, "epsilon_dpo/beta_margin_mean": 2.0731112957000732, "epsilon_dpo/beta_margin_std": 1.5216357707977295, "epsilon_dpo/loss_margin_mean": 165.01644897460938, "grad_norm": 79.74817657470703, "kl/avg_steps": 0.78125, "kl/beta": 0.012697371654212475, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.794189242333106e-07, "logits/chosen": -1.8194841146469116, "logits/rejected": -1.6801855564117432, "logps/chosen": -164.27764892578125, "logps/ref_chosen": -44.127079010009766, "logps/ref_rejected": -116.14840698242188, "logps/rejected": -401.3154296875, "loss": 0.512, "rewards/accuracies": 0.875, "rewards/chosen": -1.5177865028381348, "rewards/margins": 2.0731112957000732, "rewards/rejected": -3.590897798538208, "step": 270 }, { "epoch": 0.39794419970631423, "epsilon_dpo/beta": 0.012493887916207314, "epsilon_dpo/beta_margin_grad_mean": -0.19335998594760895, "epsilon_dpo/beta_margin_grad_std": 0.19798798859119415, "epsilon_dpo/beta_margin_mean": 2.0564560890197754, "epsilon_dpo/beta_margin_std": 1.6271862983703613, "epsilon_dpo/loss_margin_mean": 164.9576873779297, "grad_norm": 58.63235855102539, "kl/avg_steps": 0.84375, "kl/beta": 0.012598942033946514, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -1.7537434101104736, "logits/rejected": -1.6001136302947998, "logps/chosen": -133.10488891601562, "logps/ref_chosen": -35.64381408691406, "logps/ref_rejected": -97.9295883178711, "logps/rejected": -360.34832763671875, "loss": 0.5335, "rewards/accuracies": 0.9375, "rewards/chosen": -1.219379186630249, "rewards/margins": 2.0564560890197754, "rewards/rejected": -3.2758355140686035, "step": 271 }, { "epoch": 0.39941262848751835, "epsilon_dpo/beta": 0.012385448440909386, "epsilon_dpo/beta_margin_grad_mean": -0.12951304018497467, "epsilon_dpo/beta_margin_grad_std": 0.1683792918920517, "epsilon_dpo/beta_margin_mean": 2.5986411571502686, "epsilon_dpo/beta_margin_std": 1.48728609085083, "epsilon_dpo/loss_margin_mean": 210.13800048828125, "grad_norm": 43.1225471496582, "kl/avg_steps": 0.875, "kl/beta": 0.012493528425693512, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.772161666010912e-07, "logits/chosen": -1.6689045429229736, "logits/rejected": -1.6010162830352783, "logps/chosen": -112.50562286376953, "logps/ref_chosen": -26.52655792236328, "logps/ref_rejected": -112.5738525390625, "logps/rejected": -408.69091796875, "loss": 0.336, "rewards/accuracies": 0.921875, "rewards/chosen": -1.065723180770874, "rewards/margins": 2.5986409187316895, "rewards/rejected": -3.6643643379211426, "step": 272 }, { "epoch": 0.4008810572687225, "epsilon_dpo/beta": 0.012266403064131737, "epsilon_dpo/beta_margin_grad_mean": -0.14163939654827118, "epsilon_dpo/beta_margin_grad_std": 0.14621266722679138, "epsilon_dpo/beta_margin_mean": 2.353142261505127, "epsilon_dpo/beta_margin_std": 1.3905528783798218, "epsilon_dpo/loss_margin_mean": 191.923583984375, "grad_norm": 50.07746505737305, "kl/avg_steps": 0.96875, "kl/beta": 0.012385157868266106, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 3.761097448550755e-07, "logits/chosen": -1.71974515914917, "logits/rejected": -1.6216449737548828, "logps/chosen": -137.0894317626953, "logps/ref_chosen": -34.63496017456055, "logps/ref_rejected": -97.36636352539062, "logps/rejected": -391.7444152832031, "loss": 0.3461, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2572829723358154, "rewards/margins": 2.353142261505127, "rewards/rejected": -3.6104252338409424, "step": 273 }, { "epoch": 0.4023494860499266, "epsilon_dpo/beta": 0.012164046056568623, "epsilon_dpo/beta_margin_grad_mean": -0.1828116923570633, "epsilon_dpo/beta_margin_grad_std": 0.1835782825946808, "epsilon_dpo/beta_margin_mean": 2.068690061569214, "epsilon_dpo/beta_margin_std": 1.446864366531372, "epsilon_dpo/loss_margin_mean": 170.40960693359375, "grad_norm": 58.21686553955078, "kl/avg_steps": 0.84375, "kl/beta": 0.012266327627003193, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.75e-07, "logits/chosen": -1.5487819910049438, "logits/rejected": -1.4224430322647095, "logps/chosen": -150.64378356933594, "logps/ref_chosen": -32.576805114746094, "logps/ref_rejected": -83.13209533691406, "logps/rejected": -371.6086730957031, "loss": 0.4745, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4374797344207764, "rewards/margins": 2.0686898231506348, "rewards/rejected": -3.5061697959899902, "step": 274 }, { "epoch": 0.40381791483113066, "epsilon_dpo/beta": 0.012066072784364223, "epsilon_dpo/beta_margin_grad_mean": -0.1988518238067627, "epsilon_dpo/beta_margin_grad_std": 0.19736242294311523, "epsilon_dpo/beta_margin_mean": 1.9684127569198608, "epsilon_dpo/beta_margin_std": 1.5511200428009033, "epsilon_dpo/loss_margin_mean": 163.5451202392578, "grad_norm": 58.6602783203125, "kl/avg_steps": 0.8125, "kl/beta": 0.012163696810603142, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.738869612786737e-07, "logits/chosen": -1.6947541236877441, "logits/rejected": -1.5999658107757568, "logps/chosen": -152.82989501953125, "logps/ref_chosen": -34.1890983581543, "logps/ref_rejected": -98.71599578857422, "logps/rejected": -380.90191650390625, "loss": 0.5343, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4348502159118652, "rewards/margins": 1.9684127569198608, "rewards/rejected": -3.4032628536224365, "step": 275 }, { "epoch": 0.4052863436123348, "epsilon_dpo/beta": 0.011965055949985981, "epsilon_dpo/beta_margin_grad_mean": -0.2083517462015152, "epsilon_dpo/beta_margin_grad_std": 0.1770058423280716, "epsilon_dpo/beta_margin_mean": 1.7445530891418457, "epsilon_dpo/beta_margin_std": 1.3083961009979248, "epsilon_dpo/loss_margin_mean": 146.08535766601562, "grad_norm": 62.463478088378906, "kl/avg_steps": 0.84375, "kl/beta": 0.012065663002431393, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -1.75152587890625, "logits/rejected": -1.3374929428100586, "logps/chosen": -188.8147735595703, "logps/ref_chosen": -47.06272888183594, "logps/ref_rejected": -76.37776947021484, "logps/rejected": -364.2151794433594, "loss": 0.537, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6980412006378174, "rewards/margins": 1.7445529699325562, "rewards/rejected": -3.442594289779663, "step": 276 }, { "epoch": 0.4067547723935389, "epsilon_dpo/beta": 0.01186494529247284, "epsilon_dpo/beta_margin_grad_mean": -0.17211416363716125, "epsilon_dpo/beta_margin_grad_std": 0.16370359063148499, "epsilon_dpo/beta_margin_mean": 2.1374924182891846, "epsilon_dpo/beta_margin_std": 1.4325969219207764, "epsilon_dpo/loss_margin_mean": 180.449951171875, "grad_norm": 52.0661735534668, "kl/avg_steps": 0.84375, "kl/beta": 0.011964711360633373, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.71651119641714e-07, "logits/chosen": -1.622573971748352, "logits/rejected": -1.471212387084961, "logps/chosen": -159.37274169921875, "logps/ref_chosen": -35.24298095703125, "logps/ref_rejected": -99.66352844238281, "logps/rejected": -404.24322509765625, "loss": 0.4258, "rewards/accuracies": 0.953125, "rewards/chosen": -1.4755170345306396, "rewards/margins": 2.1374926567077637, "rewards/rejected": -3.613009452819824, "step": 277 }, { "epoch": 0.40822320117474303, "epsilon_dpo/beta": 0.011769380420446396, "epsilon_dpo/beta_margin_grad_mean": -0.18432171642780304, "epsilon_dpo/beta_margin_grad_std": 0.17036166787147522, "epsilon_dpo/beta_margin_mean": 2.0264534950256348, "epsilon_dpo/beta_margin_std": 1.5373589992523193, "epsilon_dpo/loss_margin_mean": 172.5458526611328, "grad_norm": 54.76714324951172, "kl/avg_steps": 0.8125, "kl/beta": 0.011864603497087955, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.705283756425872e-07, "logits/chosen": -1.7193242311477661, "logits/rejected": -1.5687487125396729, "logps/chosen": -169.8358154296875, "logps/ref_chosen": -38.0869140625, "logps/ref_rejected": -96.20486450195312, "logps/rejected": -400.4996337890625, "loss": 0.4658, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5532996654510498, "rewards/margins": 2.0264534950256348, "rewards/rejected": -3.5797531604766846, "step": 278 }, { "epoch": 0.40969162995594716, "epsilon_dpo/beta": 0.011667169630527496, "epsilon_dpo/beta_margin_grad_mean": -0.17751751840114594, "epsilon_dpo/beta_margin_grad_std": 0.16590261459350586, "epsilon_dpo/beta_margin_mean": 2.106186628341675, "epsilon_dpo/beta_margin_std": 1.458591341972351, "epsilon_dpo/loss_margin_mean": 180.77297973632812, "grad_norm": 56.205360412597656, "kl/avg_steps": 0.875, "kl/beta": 0.011768980883061886, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.6940245560867e-07, "logits/chosen": -1.4648022651672363, "logits/rejected": -1.4414713382720947, "logps/chosen": -151.58514404296875, "logps/ref_chosen": -29.908262252807617, "logps/ref_rejected": -93.1087646484375, "logps/rejected": -395.55865478515625, "loss": 0.4424, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4212896823883057, "rewards/margins": 2.106186628341675, "rewards/rejected": -3.5274763107299805, "step": 279 }, { "epoch": 0.4111600587371512, "epsilon_dpo/beta": 0.01156961265951395, "epsilon_dpo/beta_margin_grad_mean": -0.16350157558918, "epsilon_dpo/beta_margin_grad_std": 0.16755622625350952, "epsilon_dpo/beta_margin_mean": 2.1660408973693848, "epsilon_dpo/beta_margin_std": 1.4027117490768433, "epsilon_dpo/loss_margin_mean": 187.58596801757812, "grad_norm": 52.6536865234375, "kl/avg_steps": 0.84375, "kl/beta": 0.011666894890367985, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -1.6275845766067505, "logits/rejected": -1.6037421226501465, "logps/chosen": -165.87985229492188, "logps/ref_chosen": -40.1248664855957, "logps/ref_rejected": -105.53138732910156, "logps/rejected": -418.8723449707031, "loss": 0.417, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4572311639785767, "rewards/margins": 2.1660408973693848, "rewards/rejected": -3.623271942138672, "step": 280 }, { "epoch": 0.41262848751835535, "epsilon_dpo/beta": 0.011469194665551186, "epsilon_dpo/beta_margin_grad_mean": -0.18068785965442657, "epsilon_dpo/beta_margin_grad_std": 0.1649240106344223, "epsilon_dpo/beta_margin_mean": 2.053178548812866, "epsilon_dpo/beta_margin_std": 1.504626750946045, "epsilon_dpo/loss_margin_mean": 179.28672790527344, "grad_norm": 54.2619743347168, "kl/avg_steps": 0.875, "kl/beta": 0.011569279246032238, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -1.6719520092010498, "logits/rejected": -1.367035150527954, "logps/chosen": -169.83157348632812, "logps/ref_chosen": -39.501502990722656, "logps/ref_rejected": -87.23008728027344, "logps/rejected": -396.84686279296875, "loss": 0.45, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4972405433654785, "rewards/margins": 2.053178548812866, "rewards/rejected": -3.5504188537597656, "step": 281 }, { "epoch": 0.41409691629955947, "epsilon_dpo/beta": 0.011376879177987576, "epsilon_dpo/beta_margin_grad_mean": -0.18481247127056122, "epsilon_dpo/beta_margin_grad_std": 0.17733608186244965, "epsilon_dpo/beta_margin_mean": 1.9530349969863892, "epsilon_dpo/beta_margin_std": 1.3995331525802612, "epsilon_dpo/loss_margin_mean": 172.08128356933594, "grad_norm": 54.99889373779297, "kl/avg_steps": 0.8125, "kl/beta": 0.011468926444649696, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.660059364023408e-07, "logits/chosen": -1.6971466541290283, "logits/rejected": -1.5857012271881104, "logps/chosen": -174.48223876953125, "logps/ref_chosen": -46.00492858886719, "logps/ref_rejected": -101.88005828857422, "logps/rejected": -402.43865966796875, "loss": 0.4842, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4642865657806396, "rewards/margins": 1.9530349969863892, "rewards/rejected": -3.4173216819763184, "step": 282 }, { "epoch": 0.4155653450807636, "epsilon_dpo/beta": 0.01127807516604662, "epsilon_dpo/beta_margin_grad_mean": -0.1464729756116867, "epsilon_dpo/beta_margin_grad_std": 0.15092232823371887, "epsilon_dpo/beta_margin_mean": 2.340468645095825, "epsilon_dpo/beta_margin_std": 1.3866698741912842, "epsilon_dpo/loss_margin_mean": 207.81991577148438, "grad_norm": 47.481971740722656, "kl/avg_steps": 0.875, "kl/beta": 0.011376491747796535, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -1.6741735935211182, "logits/rejected": -1.502656102180481, "logps/chosen": -153.2581329345703, "logps/ref_chosen": -35.3682861328125, "logps/ref_rejected": -103.27058410644531, "logps/rejected": -428.9803466796875, "loss": 0.3582, "rewards/accuracies": 0.953125, "rewards/chosen": -1.330566644668579, "rewards/margins": 2.340468645095825, "rewards/rejected": -3.6710352897644043, "step": 283 }, { "epoch": 0.4170337738619677, "epsilon_dpo/beta": 0.011197871528565884, "epsilon_dpo/beta_margin_grad_mean": -0.18323473632335663, "epsilon_dpo/beta_margin_grad_std": 0.1965819150209427, "epsilon_dpo/beta_margin_mean": 2.156057596206665, "epsilon_dpo/beta_margin_std": 1.6328465938568115, "epsilon_dpo/loss_margin_mean": 193.1970977783203, "grad_norm": 64.37258911132812, "kl/avg_steps": 0.71875, "kl/beta": 0.011277811601758003, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -1.67313551902771, "logits/rejected": -1.5210282802581787, "logps/chosen": -168.32333374023438, "logps/ref_chosen": -41.10857391357422, "logps/ref_rejected": -99.55363464355469, "logps/rejected": -419.96551513671875, "loss": 0.4919, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4272245168685913, "rewards/margins": 2.156057357788086, "rewards/rejected": -3.583281993865967, "step": 284 }, { "epoch": 0.4185022026431718, "epsilon_dpo/beta": 0.011096964590251446, "epsilon_dpo/beta_margin_grad_mean": -0.14801117777824402, "epsilon_dpo/beta_margin_grad_std": 0.17136132717132568, "epsilon_dpo/beta_margin_mean": 2.2969985008239746, "epsilon_dpo/beta_margin_std": 1.3974249362945557, "epsilon_dpo/loss_margin_mean": 207.27685546875, "grad_norm": 60.48746871948242, "kl/avg_steps": 0.90625, "kl/beta": 0.011197330430150032, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.625819059005228e-07, "logits/chosen": -1.64085054397583, "logits/rejected": -1.4849854707717896, "logps/chosen": -163.57217407226562, "logps/ref_chosen": -35.757301330566406, "logps/ref_rejected": -108.66427612304688, "logps/rejected": -443.7559814453125, "loss": 0.4048, "rewards/accuracies": 0.953125, "rewards/chosen": -1.4201911687850952, "rewards/margins": 2.2969985008239746, "rewards/rejected": -3.7171897888183594, "step": 285 }, { "epoch": 0.4199706314243759, "epsilon_dpo/beta": 0.011004237458109856, "epsilon_dpo/beta_margin_grad_mean": -0.16391685605049133, "epsilon_dpo/beta_margin_grad_std": 0.18253783881664276, "epsilon_dpo/beta_margin_mean": 2.2222766876220703, "epsilon_dpo/beta_margin_std": 1.464618444442749, "epsilon_dpo/loss_margin_mean": 202.3627471923828, "grad_norm": 60.148529052734375, "kl/avg_steps": 0.84375, "kl/beta": 0.011096766218543053, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.614345889441346e-07, "logits/chosen": -1.7381117343902588, "logits/rejected": -1.4690245389938354, "logps/chosen": -175.14715576171875, "logps/ref_chosen": -45.06391525268555, "logps/ref_rejected": -95.78263854980469, "logps/rejected": -428.2286376953125, "loss": 0.4342, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4335534572601318, "rewards/margins": 2.2222766876220703, "rewards/rejected": -3.6558303833007812, "step": 286 }, { "epoch": 0.42143906020558003, "epsilon_dpo/beta": 0.010929361917078495, "epsilon_dpo/beta_margin_grad_mean": -0.23106051981449127, "epsilon_dpo/beta_margin_grad_std": 0.19667471945285797, "epsilon_dpo/beta_margin_mean": 1.6739410161972046, "epsilon_dpo/beta_margin_std": 1.4721882343292236, "epsilon_dpo/loss_margin_mean": 153.76026916503906, "grad_norm": 71.68363189697266, "kl/avg_steps": 0.6875, "kl/beta": 0.011003920808434486, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -1.7347445487976074, "logits/rejected": -1.4353196620941162, "logps/chosen": -186.29312133789062, "logps/ref_chosen": -44.68206787109375, "logps/ref_rejected": -82.90010070800781, "logps/rejected": -378.27142333984375, "loss": 0.611, "rewards/accuracies": 0.84375, "rewards/chosen": -1.551443099975586, "rewards/margins": 1.6739410161972046, "rewards/rejected": -3.22538423538208, "step": 287 }, { "epoch": 0.42290748898678415, "epsilon_dpo/beta": 0.010827410966157913, "epsilon_dpo/beta_margin_grad_mean": -0.15691792964935303, "epsilon_dpo/beta_margin_grad_std": 0.16194118559360504, "epsilon_dpo/beta_margin_mean": 2.1758952140808105, "epsilon_dpo/beta_margin_std": 1.2892285585403442, "epsilon_dpo/loss_margin_mean": 201.15391540527344, "grad_norm": 56.249759674072266, "kl/avg_steps": 0.9375, "kl/beta": 0.010928785428404808, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -1.6811950206756592, "logits/rejected": -1.4292771816253662, "logps/chosen": -160.89662170410156, "logps/ref_chosen": -35.92053985595703, "logps/ref_rejected": -92.28993225097656, "logps/rejected": -418.419921875, "loss": 0.3983, "rewards/accuracies": 0.96875, "rewards/chosen": -1.3546841144561768, "rewards/margins": 2.1758952140808105, "rewards/rejected": -3.5305793285369873, "step": 288 }, { "epoch": 0.4243759177679883, "epsilon_dpo/beta": 0.010733614675700665, "epsilon_dpo/beta_margin_grad_mean": -0.19883452355861664, "epsilon_dpo/beta_margin_grad_std": 0.18830937147140503, "epsilon_dpo/beta_margin_mean": 1.978835105895996, "epsilon_dpo/beta_margin_std": 1.5322097539901733, "epsilon_dpo/loss_margin_mean": 184.6563262939453, "grad_norm": 53.693756103515625, "kl/avg_steps": 0.875, "kl/beta": 0.010827279649674892, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -1.7554616928100586, "logits/rejected": -1.5511056184768677, "logps/chosen": -151.41265869140625, "logps/ref_chosen": -37.65406036376953, "logps/ref_rejected": -92.58161163330078, "logps/rejected": -390.99652099609375, "loss": 0.5158, "rewards/accuracies": 0.9375, "rewards/chosen": -1.222680687904358, "rewards/margins": 1.978835105895996, "rewards/rejected": -3.2015156745910645, "step": 289 }, { "epoch": 0.42584434654919234, "epsilon_dpo/beta": 0.01063715573400259, "epsilon_dpo/beta_margin_grad_mean": -0.17113091051578522, "epsilon_dpo/beta_margin_grad_std": 0.15478722751140594, "epsilon_dpo/beta_margin_mean": 2.094456911087036, "epsilon_dpo/beta_margin_std": 1.3732136487960815, "epsilon_dpo/loss_margin_mean": 197.09515380859375, "grad_norm": 51.92376708984375, "kl/avg_steps": 0.90625, "kl/beta": 0.010733362287282944, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.568162605525952e-07, "logits/chosen": -1.6529500484466553, "logits/rejected": -1.7434873580932617, "logps/chosen": -165.3158416748047, "logps/ref_chosen": -43.256103515625, "logps/ref_rejected": -123.40228271484375, "logps/rejected": -442.55718994140625, "loss": 0.4168, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2994060516357422, "rewards/margins": 2.094456911087036, "rewards/rejected": -3.3938629627227783, "step": 290 }, { "epoch": 0.42731277533039647, "epsilon_dpo/beta": 0.010556564666330814, "epsilon_dpo/beta_margin_grad_mean": -0.1979316920042038, "epsilon_dpo/beta_margin_grad_std": 0.18217770755290985, "epsilon_dpo/beta_margin_mean": 1.9196021556854248, "epsilon_dpo/beta_margin_std": 1.422903060913086, "epsilon_dpo/loss_margin_mean": 182.32635498046875, "grad_norm": 62.94866943359375, "kl/avg_steps": 0.765625, "kl/beta": 0.010636964812874794, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -1.8479794263839722, "logits/rejected": -1.6821517944335938, "logps/chosen": -165.5596466064453, "logps/ref_chosen": -43.823760986328125, "logps/ref_rejected": -94.49006652832031, "logps/rejected": -398.55230712890625, "loss": 0.5064, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2878754138946533, "rewards/margins": 1.9196021556854248, "rewards/rejected": -3.207477569580078, "step": 291 }, { "epoch": 0.4287812041116006, "epsilon_dpo/beta": 0.010471423156559467, "epsilon_dpo/beta_margin_grad_mean": -0.1748674064874649, "epsilon_dpo/beta_margin_grad_std": 0.17317946255207062, "epsilon_dpo/beta_margin_mean": 2.048370122909546, "epsilon_dpo/beta_margin_std": 1.3248964548110962, "epsilon_dpo/loss_margin_mean": 196.04180908203125, "grad_norm": 62.74113464355469, "kl/avg_steps": 0.8125, "kl/beta": 0.010556144639849663, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -1.7095630168914795, "logits/rejected": -1.6350113153457642, "logps/chosen": -141.83103942871094, "logps/ref_chosen": -35.38202667236328, "logps/ref_rejected": -94.85586547851562, "logps/rejected": -397.3466796875, "loss": 0.4426, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1173462867736816, "rewards/margins": 2.048370361328125, "rewards/rejected": -3.1657166481018066, "step": 292 }, { "epoch": 0.4302496328928047, "epsilon_dpo/beta": 0.010403390973806381, "epsilon_dpo/beta_margin_grad_mean": -0.2334311306476593, "epsilon_dpo/beta_margin_grad_std": 0.19696100056171417, "epsilon_dpo/beta_margin_mean": 1.6216365098953247, "epsilon_dpo/beta_margin_std": 1.3715225458145142, "epsilon_dpo/loss_margin_mean": 156.5295867919922, "grad_norm": 59.1533088684082, "kl/avg_steps": 0.65625, "kl/beta": 0.010471067391335964, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -1.8089414834976196, "logits/rejected": -1.613842248916626, "logps/chosen": -150.8321075439453, "logps/ref_chosen": -36.39015197753906, "logps/ref_rejected": -83.55977630615234, "logps/rejected": -354.53131103515625, "loss": 0.6168, "rewards/accuracies": 0.84375, "rewards/chosen": -1.193084478378296, "rewards/margins": 1.6216366291046143, "rewards/rejected": -2.81472110748291, "step": 293 }, { "epoch": 0.43171806167400884, "epsilon_dpo/beta": 0.01031605713069439, "epsilon_dpo/beta_margin_grad_mean": -0.20596943795681, "epsilon_dpo/beta_margin_grad_std": 0.16830576956272125, "epsilon_dpo/beta_margin_mean": 1.6836501359939575, "epsilon_dpo/beta_margin_std": 1.2057768106460571, "epsilon_dpo/loss_margin_mean": 163.5525360107422, "grad_norm": 53.15205001831055, "kl/avg_steps": 0.84375, "kl/beta": 0.010402798652648926, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -1.6690821647644043, "logits/rejected": -1.641129732131958, "logps/chosen": -139.99928283691406, "logps/ref_chosen": -32.32667541503906, "logps/ref_rejected": -87.2591552734375, "logps/rejected": -358.48431396484375, "loss": 0.5324, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1122854948043823, "rewards/margins": 1.6836501359939575, "rewards/rejected": -2.79593563079834, "step": 294 }, { "epoch": 0.4331864904552129, "epsilon_dpo/beta": 0.010229743085801601, "epsilon_dpo/beta_margin_grad_mean": -0.2001977413892746, "epsilon_dpo/beta_margin_grad_std": 0.19301089644432068, "epsilon_dpo/beta_margin_mean": 1.9150567054748535, "epsilon_dpo/beta_margin_std": 1.5144377946853638, "epsilon_dpo/loss_margin_mean": 187.62313842773438, "grad_norm": 73.17021179199219, "kl/avg_steps": 0.84375, "kl/beta": 0.010315759107470512, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.509802512179737e-07, "logits/chosen": -1.7157707214355469, "logits/rejected": -1.6699479818344116, "logps/chosen": -145.589599609375, "logps/ref_chosen": -32.976951599121094, "logps/ref_rejected": -96.25013732910156, "logps/rejected": -396.48590087890625, "loss": 0.5393, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1549984216690063, "rewards/margins": 1.9150567054748535, "rewards/rejected": -3.0700550079345703, "step": 295 }, { "epoch": 0.434654919236417, "epsilon_dpo/beta": 0.010147349908947945, "epsilon_dpo/beta_margin_grad_mean": -0.21665216982364655, "epsilon_dpo/beta_margin_grad_std": 0.17736758291721344, "epsilon_dpo/beta_margin_mean": 1.7295547723770142, "epsilon_dpo/beta_margin_std": 1.38618004322052, "epsilon_dpo/loss_margin_mean": 170.8570556640625, "grad_norm": 54.07697296142578, "kl/avg_steps": 0.8125, "kl/beta": 0.010229448787868023, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.498049431928577e-07, "logits/chosen": -1.944000005722046, "logits/rejected": -1.7023565769195557, "logps/chosen": -159.61997985839844, "logps/ref_chosen": -41.81062316894531, "logps/ref_rejected": -99.36541748046875, "logps/rejected": -388.03179931640625, "loss": 0.5562, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1971094608306885, "rewards/margins": 1.7295548915863037, "rewards/rejected": -2.926664352416992, "step": 296 }, { "epoch": 0.43612334801762115, "epsilon_dpo/beta": 0.010062395595014095, "epsilon_dpo/beta_margin_grad_mean": -0.19078318774700165, "epsilon_dpo/beta_margin_grad_std": 0.15408045053482056, "epsilon_dpo/beta_margin_mean": 1.7881520986557007, "epsilon_dpo/beta_margin_std": 1.1349061727523804, "epsilon_dpo/loss_margin_mean": 178.01626586914062, "grad_norm": 52.08464431762695, "kl/avg_steps": 0.84375, "kl/beta": 0.010147004388272762, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.486270052146694e-07, "logits/chosen": -1.8551688194274902, "logits/rejected": -1.8467998504638672, "logps/chosen": -146.10855102539062, "logps/ref_chosen": -35.64509582519531, "logps/ref_rejected": -101.33485412597656, "logps/rejected": -389.8145751953125, "loss": 0.469, "rewards/accuracies": 0.953125, "rewards/chosen": -1.112633466720581, "rewards/margins": 1.7881522178649902, "rewards/rejected": -2.900785446166992, "step": 297 }, { "epoch": 0.43759177679882527, "epsilon_dpo/beta": 0.009971914812922478, "epsilon_dpo/beta_margin_grad_mean": -0.17282237112522125, "epsilon_dpo/beta_margin_grad_std": 0.14743012189865112, "epsilon_dpo/beta_margin_mean": 2.0463857650756836, "epsilon_dpo/beta_margin_std": 1.3721272945404053, "epsilon_dpo/loss_margin_mean": 205.4117889404297, "grad_norm": 47.372406005859375, "kl/avg_steps": 0.90625, "kl/beta": 0.010062105022370815, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.474464683231698e-07, "logits/chosen": -1.854588508605957, "logits/rejected": -1.974242091178894, "logps/chosen": -136.94308471679688, "logps/ref_chosen": -39.13259506225586, "logps/ref_rejected": -125.148193359375, "logps/rejected": -428.3704833984375, "loss": 0.4176, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9769392013549805, "rewards/margins": 2.0463857650756836, "rewards/rejected": -3.023324966430664, "step": 298 }, { "epoch": 0.4390602055800294, "epsilon_dpo/beta": 0.009882355108857155, "epsilon_dpo/beta_margin_grad_mean": -0.17700159549713135, "epsilon_dpo/beta_margin_grad_std": 0.15176741778850555, "epsilon_dpo/beta_margin_mean": 1.9076223373413086, "epsilon_dpo/beta_margin_std": 1.1600240468978882, "epsilon_dpo/loss_margin_mean": 193.25885009765625, "grad_norm": 50.078041076660156, "kl/avg_steps": 0.90625, "kl/beta": 0.009971735998988152, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.462633636266041e-07, "logits/chosen": -1.7307016849517822, "logits/rejected": -1.819953203201294, "logps/chosen": -122.997314453125, "logps/ref_chosen": -28.626670837402344, "logps/ref_rejected": -87.74382781982422, "logps/rejected": -375.3733215332031, "loss": 0.4348, "rewards/accuracies": 0.953125, "rewards/chosen": -0.933772087097168, "rewards/margins": 1.9076223373413086, "rewards/rejected": -2.8413944244384766, "step": 299 }, { "epoch": 0.44052863436123346, "epsilon_dpo/beta": 0.009802866727113724, "epsilon_dpo/beta_margin_grad_mean": -0.1854911744594574, "epsilon_dpo/beta_margin_grad_std": 0.19396579265594482, "epsilon_dpo/beta_margin_mean": 2.040740728378296, "epsilon_dpo/beta_margin_std": 1.4987692832946777, "epsilon_dpo/loss_margin_mean": 208.71090698242188, "grad_norm": 58.53467559814453, "kl/avg_steps": 0.8125, "kl/beta": 0.009882179088890553, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -1.7490193843841553, "logits/rejected": -1.7381911277770996, "logps/chosen": -144.44647216796875, "logps/ref_chosen": -33.894203186035156, "logps/ref_rejected": -103.88007354736328, "logps/rejected": -423.14324951171875, "loss": 0.4996, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0862467288970947, "rewards/margins": 2.040740728378296, "rewards/rejected": -3.1269874572753906, "step": 300 }, { "epoch": 0.44052863436123346, "eval_epsilon_dpo/beta": 0.009748955257236958, "eval_epsilon_dpo/beta_margin_grad_mean": -0.29610493779182434, "eval_epsilon_dpo/beta_margin_grad_std": 0.20769952237606049, "eval_epsilon_dpo/beta_margin_mean": 1.1843825578689575, "eval_epsilon_dpo/beta_margin_std": 1.3051005601882935, "eval_epsilon_dpo/loss_margin_mean": 122.25267791748047, "eval_kl/n_epsilon_steps": 0.22174657881259918, "eval_kl/p_epsilon_steps": 0.778253436088562, "eval_logits/chosen": -1.9427273273468018, "eval_logits/rejected": -1.7790117263793945, "eval_logps/chosen": -225.67529296875, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -371.7172546386719, "eval_loss": 0.41313475370407104, "eval_rewards/accuracies": 0.7928082346916199, "eval_rewards/chosen": -1.5382959842681885, "eval_rewards/margins": 1.1843825578689575, "eval_rewards/rejected": -2.7226786613464355, "eval_runtime": 38.3815, "eval_samples_per_second": 60.941, "eval_steps_per_second": 1.928, "step": 300 }, { "epoch": 0.4419970631424376, "epsilon_dpo/beta": 0.009714669547975063, "epsilon_dpo/beta_margin_grad_mean": -0.16645051538944244, "epsilon_dpo/beta_margin_grad_std": 0.14359311759471893, "epsilon_dpo/beta_margin_mean": 1.9759942293167114, "epsilon_dpo/beta_margin_std": 1.1226637363433838, "epsilon_dpo/loss_margin_mean": 203.62547302246094, "grad_norm": 49.01324462890625, "kl/avg_steps": 0.90625, "kl/beta": 0.009802533313632011, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -1.7920889854431152, "logits/rejected": -1.7111059427261353, "logps/chosen": -140.640380859375, "logps/ref_chosen": -35.052242279052734, "logps/ref_rejected": -98.54647827148438, "logps/rejected": -407.7601013183594, "loss": 0.402, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0266605615615845, "rewards/margins": 1.9759942293167114, "rewards/rejected": -3.002654790878296, "step": 301 }, { "epoch": 0.4434654919236417, "epsilon_dpo/beta": 0.009633492678403854, "epsilon_dpo/beta_margin_grad_mean": -0.1874367892742157, "epsilon_dpo/beta_margin_grad_std": 0.1728639304637909, "epsilon_dpo/beta_margin_mean": 1.9052062034606934, "epsilon_dpo/beta_margin_std": 1.2975436449050903, "epsilon_dpo/loss_margin_mean": 198.14389038085938, "grad_norm": 63.02902603149414, "kl/avg_steps": 0.84375, "kl/beta": 0.00971449539065361, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.426989547989902e-07, "logits/chosen": -1.8670696020126343, "logits/rejected": -1.852245807647705, "logps/chosen": -124.57978820800781, "logps/ref_chosen": -31.218624114990234, "logps/ref_rejected": -103.73237609863281, "logps/rejected": -395.2374267578125, "loss": 0.4794, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9009679555892944, "rewards/margins": 1.9052062034606934, "rewards/rejected": -2.8061742782592773, "step": 302 }, { "epoch": 0.44493392070484583, "epsilon_dpo/beta": 0.009558911435306072, "epsilon_dpo/beta_margin_grad_mean": -0.1947356015443802, "epsilon_dpo/beta_margin_grad_std": 0.19330711662769318, "epsilon_dpo/beta_margin_mean": 1.8838841915130615, "epsilon_dpo/beta_margin_std": 1.4101883172988892, "epsilon_dpo/loss_margin_mean": 197.6653594970703, "grad_norm": 64.47979736328125, "kl/avg_steps": 0.78125, "kl/beta": 0.009633215144276619, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -1.9428577423095703, "logits/rejected": -1.761818289756775, "logps/chosen": -145.26632690429688, "logps/ref_chosen": -42.143306732177734, "logps/ref_rejected": -93.17671203613281, "logps/rejected": -393.965087890625, "loss": 0.5374, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9886916875839233, "rewards/margins": 1.8838841915130615, "rewards/rejected": -2.8725757598876953, "step": 303 }, { "epoch": 0.44640234948604995, "epsilon_dpo/beta": 0.009478837251663208, "epsilon_dpo/beta_margin_grad_mean": -0.17279693484306335, "epsilon_dpo/beta_margin_grad_std": 0.15856152772903442, "epsilon_dpo/beta_margin_mean": 1.9841113090515137, "epsilon_dpo/beta_margin_std": 1.1967487335205078, "epsilon_dpo/loss_margin_mean": 209.67984008789062, "grad_norm": 50.6738395690918, "kl/avg_steps": 0.84375, "kl/beta": 0.0095585398375988, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.403104165467883e-07, "logits/chosen": -1.9802241325378418, "logits/rejected": -1.827461838722229, "logps/chosen": -144.84146118164062, "logps/ref_chosen": -42.97766876220703, "logps/ref_rejected": -97.31230926513672, "logps/rejected": -408.85595703125, "loss": 0.4258, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9677666425704956, "rewards/margins": 1.9841113090515137, "rewards/rejected": -2.951878070831299, "step": 304 }, { "epoch": 0.447870778267254, "epsilon_dpo/beta": 0.009399528615176678, "epsilon_dpo/beta_margin_grad_mean": -0.1955670863389969, "epsilon_dpo/beta_margin_grad_std": 0.16656768321990967, "epsilon_dpo/beta_margin_mean": 1.814961552619934, "epsilon_dpo/beta_margin_std": 1.2615888118743896, "epsilon_dpo/loss_margin_mean": 193.46107482910156, "grad_norm": 49.391204833984375, "kl/avg_steps": 0.84375, "kl/beta": 0.009478564374148846, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.391125620245535e-07, "logits/chosen": -1.9485129117965698, "logits/rejected": -1.8468644618988037, "logps/chosen": -159.82958984375, "logps/ref_chosen": -46.15346145629883, "logps/ref_rejected": -98.42967224121094, "logps/rejected": -405.56689453125, "loss": 0.4923, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0703450441360474, "rewards/margins": 1.8149614334106445, "rewards/rejected": -2.8853065967559814, "step": 305 }, { "epoch": 0.44933920704845814, "epsilon_dpo/beta": 0.009312070906162262, "epsilon_dpo/beta_margin_grad_mean": -0.17502400279045105, "epsilon_dpo/beta_margin_grad_std": 0.13139215111732483, "epsilon_dpo/beta_margin_mean": 1.8663511276245117, "epsilon_dpo/beta_margin_std": 1.0456689596176147, "epsilon_dpo/loss_margin_mean": 200.55418395996094, "grad_norm": 44.052284240722656, "kl/avg_steps": 0.9375, "kl/beta": 0.009399257600307465, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -1.9102896451950073, "logits/rejected": -1.7865833044052124, "logps/chosen": -163.22915649414062, "logps/ref_chosen": -46.639854431152344, "logps/ref_rejected": -91.45211791992188, "logps/rejected": -408.5956115722656, "loss": 0.4142, "rewards/accuracies": 0.984375, "rewards/chosen": -1.0861380100250244, "rewards/margins": 1.8663511276245117, "rewards/rejected": -2.952488899230957, "step": 306 }, { "epoch": 0.45080763582966227, "epsilon_dpo/beta": 0.009248863905668259, "epsilon_dpo/beta_margin_grad_mean": -0.2182687371969223, "epsilon_dpo/beta_margin_grad_std": 0.1931067556142807, "epsilon_dpo/beta_margin_mean": 1.6918431520462036, "epsilon_dpo/beta_margin_std": 1.3128682374954224, "epsilon_dpo/loss_margin_mean": 183.6102294921875, "grad_norm": 60.684417724609375, "kl/avg_steps": 0.6875, "kl/beta": 0.009311958216130733, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.367098400098881e-07, "logits/chosen": -1.928983449935913, "logits/rejected": -1.8208749294281006, "logps/chosen": -150.8052978515625, "logps/ref_chosen": -44.40485382080078, "logps/ref_rejected": -92.84294128417969, "logps/rejected": -382.8536376953125, "loss": 0.5766, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9867569208145142, "rewards/margins": 1.6918432712554932, "rewards/rejected": -2.6786000728607178, "step": 307 }, { "epoch": 0.4522760646108664, "epsilon_dpo/beta": 0.009177041240036488, "epsilon_dpo/beta_margin_grad_mean": -0.21642521023750305, "epsilon_dpo/beta_margin_grad_std": 0.18705737590789795, "epsilon_dpo/beta_margin_mean": 1.6384843587875366, "epsilon_dpo/beta_margin_std": 1.205591082572937, "epsilon_dpo/loss_margin_mean": 179.0634307861328, "grad_norm": 64.62168884277344, "kl/avg_steps": 0.78125, "kl/beta": 0.00924837589263916, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.355050358314172e-07, "logits/chosen": -1.9160457849502563, "logits/rejected": -1.7217934131622314, "logps/chosen": -164.06167602539062, "logps/ref_chosen": -37.731178283691406, "logps/ref_rejected": -85.77114868164062, "logps/rejected": -391.16510009765625, "loss": 0.5679, "rewards/accuracies": 0.90625, "rewards/chosen": -1.161744236946106, "rewards/margins": 1.638484239578247, "rewards/rejected": -2.8002285957336426, "step": 308 }, { "epoch": 0.45374449339207046, "epsilon_dpo/beta": 0.009103032760322094, "epsilon_dpo/beta_margin_grad_mean": -0.20181921124458313, "epsilon_dpo/beta_margin_grad_std": 0.18235483765602112, "epsilon_dpo/beta_margin_mean": 1.7625192403793335, "epsilon_dpo/beta_margin_std": 1.2436327934265137, "epsilon_dpo/loss_margin_mean": 194.1045379638672, "grad_norm": 56.110897064208984, "kl/avg_steps": 0.8125, "kl/beta": 0.00917668268084526, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -1.9111260175704956, "logits/rejected": -1.7678544521331787, "logps/chosen": -139.52688598632812, "logps/ref_chosen": -35.310665130615234, "logps/ref_rejected": -86.65730285644531, "logps/rejected": -384.9780578613281, "loss": 0.5253, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9513264894485474, "rewards/margins": 1.7625192403793335, "rewards/rejected": -2.713845729827881, "step": 309 }, { "epoch": 0.4552129221732746, "epsilon_dpo/beta": 0.009035356342792511, "epsilon_dpo/beta_margin_grad_mean": -0.2072700560092926, "epsilon_dpo/beta_margin_grad_std": 0.18662025034427643, "epsilon_dpo/beta_margin_mean": 1.7331546545028687, "epsilon_dpo/beta_margin_std": 1.237532138824463, "epsilon_dpo/loss_margin_mean": 192.4178924560547, "grad_norm": 57.408546447753906, "kl/avg_steps": 0.75, "kl/beta": 0.009102723561227322, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -1.9054917097091675, "logits/rejected": -1.831470012664795, "logps/chosen": -152.9647674560547, "logps/ref_chosen": -42.4453125, "logps/ref_rejected": -85.33100128173828, "logps/rejected": -388.26837158203125, "loss": 0.5391, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0019582509994507, "rewards/margins": 1.7331546545028687, "rewards/rejected": -2.7351129055023193, "step": 310 }, { "epoch": 0.4566813509544787, "epsilon_dpo/beta": 0.008951152674853802, "epsilon_dpo/beta_margin_grad_mean": -0.1625707447528839, "epsilon_dpo/beta_margin_grad_std": 0.14155903458595276, "epsilon_dpo/beta_margin_mean": 2.1258339881896973, "epsilon_dpo/beta_margin_std": 1.3122553825378418, "epsilon_dpo/loss_margin_mean": 237.6621856689453, "grad_norm": 37.80696487426758, "kl/avg_steps": 0.9375, "kl/beta": 0.0090349605306983, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -1.8577749729156494, "logits/rejected": -1.6029454469680786, "logps/chosen": -135.50538635253906, "logps/ref_chosen": -30.15520477294922, "logps/ref_rejected": -82.74742126464844, "logps/rejected": -425.759765625, "loss": 0.3892, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9435994029045105, "rewards/margins": 2.1258339881896973, "rewards/rejected": -3.0694334506988525, "step": 311 }, { "epoch": 0.4581497797356828, "epsilon_dpo/beta": 0.008882002905011177, "epsilon_dpo/beta_margin_grad_mean": -0.209417924284935, "epsilon_dpo/beta_margin_grad_std": 0.18264873325824738, "epsilon_dpo/beta_margin_mean": 1.7365641593933105, "epsilon_dpo/beta_margin_std": 1.3034473657608032, "epsilon_dpo/loss_margin_mean": 196.04774475097656, "grad_norm": 56.1854362487793, "kl/avg_steps": 0.78125, "kl/beta": 0.008951044641435146, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.306636061080487e-07, "logits/chosen": -1.8750476837158203, "logits/rejected": -1.6363931894302368, "logps/chosen": -154.71817016601562, "logps/ref_chosen": -32.57682418823242, "logps/ref_rejected": -81.77426147460938, "logps/rejected": -399.96337890625, "loss": 0.5417, "rewards/accuracies": 0.875, "rewards/chosen": -1.088057041168213, "rewards/margins": 1.7365641593933105, "rewards/rejected": -2.8246212005615234, "step": 312 }, { "epoch": 0.45961820851688695, "epsilon_dpo/beta": 0.008807598613202572, "epsilon_dpo/beta_margin_grad_mean": -0.19396935403347015, "epsilon_dpo/beta_margin_grad_std": 0.16426628828048706, "epsilon_dpo/beta_margin_mean": 1.8069427013397217, "epsilon_dpo/beta_margin_std": 1.2012414932250977, "epsilon_dpo/loss_margin_mean": 205.55426025390625, "grad_norm": 47.559181213378906, "kl/avg_steps": 0.84375, "kl/beta": 0.008881657384335995, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -1.9602954387664795, "logits/rejected": -1.78814697265625, "logps/chosen": -147.1026153564453, "logps/ref_chosen": -28.45586395263672, "logps/ref_rejected": -93.60973358154297, "logps/rejected": -417.8107604980469, "loss": 0.4862, "rewards/accuracies": 0.921875, "rewards/chosen": -1.0461211204528809, "rewards/margins": 1.8069427013397217, "rewards/rejected": -2.8530638217926025, "step": 313 }, { "epoch": 0.461086637298091, "epsilon_dpo/beta": 0.008725648745894432, "epsilon_dpo/beta_margin_grad_mean": -0.19880293309688568, "epsilon_dpo/beta_margin_grad_std": 0.14619764685630798, "epsilon_dpo/beta_margin_mean": 1.7408623695373535, "epsilon_dpo/beta_margin_std": 1.1878901720046997, "epsilon_dpo/loss_margin_mean": 199.67056274414062, "grad_norm": 43.1519660949707, "kl/avg_steps": 0.9375, "kl/beta": 0.008807345293462276, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -1.9734838008880615, "logits/rejected": -1.930768370628357, "logps/chosen": -144.1108856201172, "logps/ref_chosen": -34.89348220825195, "logps/ref_rejected": -102.7928695678711, "logps/rejected": -411.68084716796875, "loss": 0.4832, "rewards/accuracies": 0.96875, "rewards/chosen": -0.953238308429718, "rewards/margins": 1.7408623695373535, "rewards/rejected": -2.694100856781006, "step": 314 }, { "epoch": 0.46255506607929514, "epsilon_dpo/beta": 0.008652785792946815, "epsilon_dpo/beta_margin_grad_mean": -0.23646311461925507, "epsilon_dpo/beta_margin_grad_std": 0.16848552227020264, "epsilon_dpo/beta_margin_mean": 1.4849480390548706, "epsilon_dpo/beta_margin_std": 1.1557073593139648, "epsilon_dpo/loss_margin_mean": 171.94798278808594, "grad_norm": 62.91958999633789, "kl/avg_steps": 0.84375, "kl/beta": 0.008725542575120926, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.270101039870797e-07, "logits/chosen": -1.8116509914398193, "logits/rejected": -1.7590763568878174, "logps/chosen": -159.944091796875, "logps/ref_chosen": -33.06333923339844, "logps/ref_rejected": -85.8106918334961, "logps/rejected": -384.639404296875, "loss": 0.5973, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0994541645050049, "rewards/margins": 1.4849481582641602, "rewards/rejected": -2.584402084350586, "step": 315 }, { "epoch": 0.46402349486049926, "epsilon_dpo/beta": 0.008572276681661606, "epsilon_dpo/beta_margin_grad_mean": -0.19783826172351837, "epsilon_dpo/beta_margin_grad_std": 0.14127233624458313, "epsilon_dpo/beta_margin_mean": 1.7659293413162231, "epsilon_dpo/beta_margin_std": 1.222463607788086, "epsilon_dpo/loss_margin_mean": 206.15966796875, "grad_norm": 53.759849548339844, "kl/avg_steps": 0.9375, "kl/beta": 0.008652537129819393, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -1.918544054031372, "logits/rejected": -1.8977160453796387, "logps/chosen": -171.6690216064453, "logps/ref_chosen": -39.93564224243164, "logps/ref_rejected": -107.95684814453125, "logps/rejected": -445.84991455078125, "loss": 0.4775, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1298701763153076, "rewards/margins": 1.7659293413162231, "rewards/rejected": -2.8957996368408203, "step": 316 }, { "epoch": 0.4654919236417034, "epsilon_dpo/beta": 0.00850069522857666, "epsilon_dpo/beta_margin_grad_mean": -0.1773056834936142, "epsilon_dpo/beta_margin_grad_std": 0.1531369686126709, "epsilon_dpo/beta_margin_mean": 1.9689700603485107, "epsilon_dpo/beta_margin_std": 1.239305853843689, "epsilon_dpo/loss_margin_mean": 232.01678466796875, "grad_norm": 45.539825439453125, "kl/avg_steps": 0.84375, "kl/beta": 0.008572173304855824, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -1.982346773147583, "logits/rejected": -1.7732250690460205, "logps/chosen": -157.90228271484375, "logps/ref_chosen": -33.96290969848633, "logps/ref_rejected": -107.23854064941406, "logps/rejected": -463.1947021484375, "loss": 0.4319, "rewards/accuracies": 0.921875, "rewards/chosen": -1.0545722246170044, "rewards/margins": 1.9689700603485107, "rewards/rejected": -3.0235424041748047, "step": 317 }, { "epoch": 0.4669603524229075, "epsilon_dpo/beta": 0.00842957105487585, "epsilon_dpo/beta_margin_grad_mean": -0.21183888614177704, "epsilon_dpo/beta_margin_grad_std": 0.17660567164421082, "epsilon_dpo/beta_margin_mean": 1.6326032876968384, "epsilon_dpo/beta_margin_std": 1.1618354320526123, "epsilon_dpo/loss_margin_mean": 194.12109375, "grad_norm": 55.2903938293457, "kl/avg_steps": 0.84375, "kl/beta": 0.008500450290739536, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.233383385962115e-07, "logits/chosen": -2.0958797931671143, "logits/rejected": -1.8145737648010254, "logps/chosen": -180.26669311523438, "logps/ref_chosen": -40.75099563598633, "logps/ref_rejected": -89.30494689941406, "logps/rejected": -422.9417419433594, "loss": 0.5493, "rewards/accuracies": 0.921875, "rewards/chosen": -1.177966833114624, "rewards/margins": 1.6326031684875488, "rewards/rejected": -2.810570240020752, "step": 318 }, { "epoch": 0.4684287812041116, "epsilon_dpo/beta": 0.008361675776541233, "epsilon_dpo/beta_margin_grad_mean": -0.18624837696552277, "epsilon_dpo/beta_margin_grad_std": 0.18912473320960999, "epsilon_dpo/beta_margin_mean": 1.9518612623214722, "epsilon_dpo/beta_margin_std": 1.3541351556777954, "epsilon_dpo/loss_margin_mean": 234.00582885742188, "grad_norm": 45.54665756225586, "kl/avg_steps": 0.8125, "kl/beta": 0.008429327979683876, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -2.003638982772827, "logits/rejected": -2.138455867767334, "logps/chosen": -135.60565185546875, "logps/ref_chosen": -31.367862701416016, "logps/ref_rejected": -121.58282470703125, "logps/rejected": -459.826416015625, "loss": 0.4951, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8743832111358643, "rewards/margins": 1.9518612623214722, "rewards/rejected": -2.826244354248047, "step": 319 }, { "epoch": 0.4698972099853157, "epsilon_dpo/beta": 0.008289058692753315, "epsilon_dpo/beta_margin_grad_mean": -0.18520162999629974, "epsilon_dpo/beta_margin_grad_std": 0.15284550189971924, "epsilon_dpo/beta_margin_mean": 1.8797887563705444, "epsilon_dpo/beta_margin_std": 1.241491675376892, "epsilon_dpo/loss_margin_mean": 227.08258056640625, "grad_norm": 42.13555908203125, "kl/avg_steps": 0.875, "kl/beta": 0.008361391723155975, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.208807785813777e-07, "logits/chosen": -2.008018970489502, "logits/rejected": -1.9689502716064453, "logps/chosen": -159.43008422851562, "logps/ref_chosen": -38.88483428955078, "logps/ref_rejected": -104.38986206054688, "logps/rejected": -452.0176696777344, "loss": 0.4546, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9999314546585083, "rewards/margins": 1.8797886371612549, "rewards/rejected": -2.8797202110290527, "step": 320 }, { "epoch": 0.4713656387665198, "epsilon_dpo/beta": 0.008217157796025276, "epsilon_dpo/beta_margin_grad_mean": -0.19574427604675293, "epsilon_dpo/beta_margin_grad_std": 0.17788065969944, "epsilon_dpo/beta_margin_mean": 1.8304779529571533, "epsilon_dpo/beta_margin_std": 1.294824481010437, "epsilon_dpo/loss_margin_mean": 223.14154052734375, "grad_norm": 57.55246353149414, "kl/avg_steps": 0.875, "kl/beta": 0.00828886404633522, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -1.9321234226226807, "logits/rejected": -1.7463536262512207, "logps/chosen": -163.596923828125, "logps/ref_chosen": -33.60523986816406, "logps/ref_rejected": -97.33091735839844, "logps/rejected": -450.46417236328125, "loss": 0.5089, "rewards/accuracies": 0.921875, "rewards/chosen": -1.0699717998504639, "rewards/margins": 1.8304779529571533, "rewards/rejected": -2.900449752807617, "step": 321 }, { "epoch": 0.47283406754772395, "epsilon_dpo/beta": 0.008156154304742813, "epsilon_dpo/beta_margin_grad_mean": -0.1957569420337677, "epsilon_dpo/beta_margin_grad_std": 0.16668935120105743, "epsilon_dpo/beta_margin_mean": 1.7986748218536377, "epsilon_dpo/beta_margin_std": 1.202728509902954, "epsilon_dpo/loss_margin_mean": 221.13067626953125, "grad_norm": 51.22137451171875, "kl/avg_steps": 0.75, "kl/beta": 0.008216965943574905, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.184157475180207e-07, "logits/chosen": -2.0422754287719727, "logits/rejected": -1.863848090171814, "logps/chosen": -169.1143798828125, "logps/ref_chosen": -39.760765075683594, "logps/ref_rejected": -101.8095703125, "logps/rejected": -452.2938537597656, "loss": 0.4902, "rewards/accuracies": 0.921875, "rewards/chosen": -1.057375431060791, "rewards/margins": 1.7986748218536377, "rewards/rejected": -2.8560502529144287, "step": 322 }, { "epoch": 0.47430249632892807, "epsilon_dpo/beta": 0.008091602474451065, "epsilon_dpo/beta_margin_grad_mean": -0.2292575240135193, "epsilon_dpo/beta_margin_grad_std": 0.17154912650585175, "epsilon_dpo/beta_margin_mean": 1.5250296592712402, "epsilon_dpo/beta_margin_std": 1.1593869924545288, "epsilon_dpo/loss_margin_mean": 188.9366912841797, "grad_norm": 59.20304489135742, "kl/avg_steps": 0.796875, "kl/beta": 0.008155797608196735, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.171805115074251e-07, "logits/chosen": -1.9689267873764038, "logits/rejected": -1.8758494853973389, "logps/chosen": -173.9646759033203, "logps/ref_chosen": -39.59267807006836, "logps/ref_rejected": -78.83342742919922, "logps/rejected": -402.1421203613281, "loss": 0.5838, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0898997783660889, "rewards/margins": 1.5250296592712402, "rewards/rejected": -2.614929437637329, "step": 323 }, { "epoch": 0.47577092511013214, "epsilon_dpo/beta": 0.008031437173485756, "epsilon_dpo/beta_margin_grad_mean": -0.19054661691188812, "epsilon_dpo/beta_margin_grad_std": 0.18834634125232697, "epsilon_dpo/beta_margin_mean": 1.9402786493301392, "epsilon_dpo/beta_margin_std": 1.3676698207855225, "epsilon_dpo/loss_margin_mean": 242.30368041992188, "grad_norm": 56.6846923828125, "kl/avg_steps": 0.75, "kl/beta": 0.008091319352388382, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -2.086808443069458, "logits/rejected": -1.915328025817871, "logps/chosen": -158.4698486328125, "logps/ref_chosen": -39.379974365234375, "logps/ref_rejected": -94.40634155273438, "logps/rejected": -455.7998962402344, "loss": 0.4977, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9597259163856506, "rewards/margins": 1.9402787685394287, "rewards/rejected": -2.9000043869018555, "step": 324 }, { "epoch": 0.47723935389133626, "epsilon_dpo/beta": 0.007969140075147152, "epsilon_dpo/beta_margin_grad_mean": -0.22008143365383148, "epsilon_dpo/beta_margin_grad_std": 0.1740773767232895, "epsilon_dpo/beta_margin_mean": 1.603606939315796, "epsilon_dpo/beta_margin_std": 1.2323416471481323, "epsilon_dpo/loss_margin_mean": 201.80323791503906, "grad_norm": 57.17675018310547, "kl/avg_steps": 0.78125, "kl/beta": 0.008031086064875126, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.147047612756302e-07, "logits/chosen": -2.2039828300476074, "logits/rejected": -1.9359973669052124, "logps/chosen": -174.04409790039062, "logps/ref_chosen": -44.78877258300781, "logps/ref_rejected": -76.1741943359375, "logps/rejected": -407.23272705078125, "loss": 0.5678, "rewards/accuracies": 0.90625, "rewards/chosen": -1.032346248626709, "rewards/margins": 1.603606939315796, "rewards/rejected": -2.635953426361084, "step": 325 }, { "epoch": 0.4787077826725404, "epsilon_dpo/beta": 0.007899893447756767, "epsilon_dpo/beta_margin_grad_mean": -0.19375857710838318, "epsilon_dpo/beta_margin_grad_std": 0.14700661599636078, "epsilon_dpo/beta_margin_mean": 1.8040101528167725, "epsilon_dpo/beta_margin_std": 1.1729263067245483, "epsilon_dpo/loss_margin_mean": 228.64596557617188, "grad_norm": 43.5681037902832, "kl/avg_steps": 0.875, "kl/beta": 0.007968829944729805, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.134643122927519e-07, "logits/chosen": -2.2271931171417236, "logits/rejected": -1.8989125490188599, "logps/chosen": -165.11163330078125, "logps/ref_chosen": -39.91899871826172, "logps/ref_rejected": -87.51998901367188, "logps/rejected": -441.35858154296875, "loss": 0.4677, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9895492792129517, "rewards/margins": 1.8040101528167725, "rewards/rejected": -2.7935595512390137, "step": 326 }, { "epoch": 0.4801762114537445, "epsilon_dpo/beta": 0.007831367664039135, "epsilon_dpo/beta_margin_grad_mean": -0.16008856892585754, "epsilon_dpo/beta_margin_grad_std": 0.15593233704566956, "epsilon_dpo/beta_margin_mean": 2.0613248348236084, "epsilon_dpo/beta_margin_std": 1.1618621349334717, "epsilon_dpo/loss_margin_mean": 263.6210632324219, "grad_norm": 45.13627624511719, "kl/avg_steps": 0.875, "kl/beta": 0.007899707183241844, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -2.139697551727295, "logits/rejected": -1.981135368347168, "logps/chosen": -143.63931274414062, "logps/ref_chosen": -38.00127410888672, "logps/ref_rejected": -119.96142578125, "logps/rejected": -489.22052001953125, "loss": 0.3983, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8290703296661377, "rewards/margins": 2.0613250732421875, "rewards/rejected": -2.890395164489746, "step": 327 }, { "epoch": 0.48164464023494863, "epsilon_dpo/beta": 0.0077707814052701, "epsilon_dpo/beta_margin_grad_mean": -0.18991978466510773, "epsilon_dpo/beta_margin_grad_std": 0.1622304469347, "epsilon_dpo/beta_margin_mean": 1.8376761674880981, "epsilon_dpo/beta_margin_std": 1.1939152479171753, "epsilon_dpo/loss_margin_mean": 237.0498046875, "grad_norm": 43.36030197143555, "kl/avg_steps": 0.78125, "kl/beta": 0.00783118512481451, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -2.1575894355773926, "logits/rejected": -1.941756010055542, "logps/chosen": -156.41744995117188, "logps/ref_chosen": -34.87682342529297, "logps/ref_rejected": -96.5279769897461, "logps/rejected": -455.118408203125, "loss": 0.4714, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9466592669487, "rewards/margins": 1.8376761674880981, "rewards/rejected": -2.7843356132507324, "step": 328 }, { "epoch": 0.4831130690161527, "epsilon_dpo/beta": 0.007708113174885511, "epsilon_dpo/beta_margin_grad_mean": -0.18650057911872864, "epsilon_dpo/beta_margin_grad_std": 0.1677255630493164, "epsilon_dpo/beta_margin_mean": 1.8852558135986328, "epsilon_dpo/beta_margin_std": 1.2298924922943115, "epsilon_dpo/loss_margin_mean": 245.12493896484375, "grad_norm": 50.19667053222656, "kl/avg_steps": 0.8125, "kl/beta": 0.007770477794110775, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -2.229661464691162, "logits/rejected": -2.011414051055908, "logps/chosen": -148.80914306640625, "logps/ref_chosen": -35.48017120361328, "logps/ref_rejected": -105.25934600830078, "logps/rejected": -463.7132568359375, "loss": 0.4686, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8763324022293091, "rewards/margins": 1.8852558135986328, "rewards/rejected": -2.7615880966186523, "step": 329 }, { "epoch": 0.4845814977973568, "epsilon_dpo/beta": 0.007645989768207073, "epsilon_dpo/beta_margin_grad_mean": -0.1875944435596466, "epsilon_dpo/beta_margin_grad_std": 0.16437454521656036, "epsilon_dpo/beta_margin_mean": 1.8213557004928589, "epsilon_dpo/beta_margin_std": 1.1398029327392578, "epsilon_dpo/loss_margin_mean": 238.7486572265625, "grad_norm": 45.98007583618164, "kl/avg_steps": 0.8125, "kl/beta": 0.0077078514732420444, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.084861204504122e-07, "logits/chosen": -2.0862879753112793, "logits/rejected": -2.0670981407165527, "logps/chosen": -126.71240234375, "logps/ref_chosen": -29.089645385742188, "logps/ref_rejected": -93.00184631347656, "logps/rejected": -429.37322998046875, "loss": 0.4709, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7483941912651062, "rewards/margins": 1.8213555812835693, "rewards/rejected": -2.5697498321533203, "step": 330 }, { "epoch": 0.48604992657856094, "epsilon_dpo/beta": 0.0075819771736860275, "epsilon_dpo/beta_margin_grad_mean": -0.1762796938419342, "epsilon_dpo/beta_margin_grad_std": 0.15704338252544403, "epsilon_dpo/beta_margin_mean": 1.8491023778915405, "epsilon_dpo/beta_margin_std": 1.0536081790924072, "epsilon_dpo/loss_margin_mean": 244.35574340820312, "grad_norm": 42.173667907714844, "kl/avg_steps": 0.84375, "kl/beta": 0.0076457299292087555, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.072376374875335e-07, "logits/chosen": -2.128115177154541, "logits/rejected": -1.9702262878417969, "logps/chosen": -145.30062866210938, "logps/ref_chosen": -30.57064437866211, "logps/ref_rejected": -95.55513000488281, "logps/rejected": -454.640869140625, "loss": 0.4442, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8718706369400024, "rewards/margins": 1.84910249710083, "rewards/rejected": -2.720973014831543, "step": 331 }, { "epoch": 0.48751835535976507, "epsilon_dpo/beta": 0.007520908955484629, "epsilon_dpo/beta_margin_grad_mean": -0.2063533514738083, "epsilon_dpo/beta_margin_grad_std": 0.1539819985628128, "epsilon_dpo/beta_margin_mean": 1.6616438627243042, "epsilon_dpo/beta_margin_std": 1.0939134359359741, "epsilon_dpo/loss_margin_mean": 221.3557891845703, "grad_norm": 46.4879035949707, "kl/avg_steps": 0.8125, "kl/beta": 0.00758175877854228, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.059876462596758e-07, "logits/chosen": -2.075554370880127, "logits/rejected": -1.9067268371582031, "logps/chosen": -150.6046142578125, "logps/ref_chosen": -31.9512882232666, "logps/ref_rejected": -81.75474548339844, "logps/rejected": -421.76385498046875, "loss": 0.5063, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8930763006210327, "rewards/margins": 1.6616438627243042, "rewards/rejected": -2.554720163345337, "step": 332 }, { "epoch": 0.4889867841409692, "epsilon_dpo/beta": 0.007457944098860025, "epsilon_dpo/beta_margin_grad_mean": -0.1893877238035202, "epsilon_dpo/beta_margin_grad_std": 0.14721722900867462, "epsilon_dpo/beta_margin_mean": 1.7599835395812988, "epsilon_dpo/beta_margin_std": 1.0482655763626099, "epsilon_dpo/loss_margin_mean": 236.4034423828125, "grad_norm": 41.35095977783203, "kl/avg_steps": 0.84375, "kl/beta": 0.007520653773099184, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -2.1496317386627197, "logits/rejected": -2.04693603515625, "logps/chosen": -148.9061737060547, "logps/ref_chosen": -35.37367630004883, "logps/ref_rejected": -102.79765319824219, "logps/rejected": -452.73358154296875, "loss": 0.4612, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8483164310455322, "rewards/margins": 1.7599835395812988, "rewards/rejected": -2.608299732208252, "step": 333 }, { "epoch": 0.49045521292217326, "epsilon_dpo/beta": 0.007395543623715639, "epsilon_dpo/beta_margin_grad_mean": -0.20312325656414032, "epsilon_dpo/beta_margin_grad_std": 0.1642070859670639, "epsilon_dpo/beta_margin_mean": 1.6376922130584717, "epsilon_dpo/beta_margin_std": 1.04725182056427, "epsilon_dpo/loss_margin_mean": 221.89352416992188, "grad_norm": 43.43370819091797, "kl/avg_steps": 0.84375, "kl/beta": 0.007457728963345289, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.034832708016243e-07, "logits/chosen": -2.1904244422912598, "logits/rejected": -2.034341812133789, "logps/chosen": -160.87864685058594, "logps/ref_chosen": -44.046653747558594, "logps/ref_rejected": -101.81437683105469, "logps/rejected": -440.5398864746094, "loss": 0.5177, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8658192157745361, "rewards/margins": 1.6376922130584717, "rewards/rejected": -2.503511428833008, "step": 334 }, { "epoch": 0.4919236417033774, "epsilon_dpo/beta": 0.007335977163165808, "epsilon_dpo/beta_margin_grad_mean": -0.24230733513832092, "epsilon_dpo/beta_margin_grad_std": 0.15432856976985931, "epsilon_dpo/beta_margin_mean": 1.3553001880645752, "epsilon_dpo/beta_margin_std": 0.970168948173523, "epsilon_dpo/loss_margin_mean": 185.18431091308594, "grad_norm": 51.230499267578125, "kl/avg_steps": 0.8125, "kl/beta": 0.0073953308165073395, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.022289525640531e-07, "logits/chosen": -2.224094867706299, "logits/rejected": -2.0437393188476562, "logps/chosen": -182.1242218017578, "logps/ref_chosen": -45.84385681152344, "logps/ref_rejected": -93.38748931884766, "logps/rejected": -414.8521728515625, "loss": 0.6057, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0017235279083252, "rewards/margins": 1.3553001880645752, "rewards/rejected": -2.3570237159729004, "step": 335 }, { "epoch": 0.4933920704845815, "epsilon_dpo/beta": 0.00726768234744668, "epsilon_dpo/beta_margin_grad_mean": -0.18038199841976166, "epsilon_dpo/beta_margin_grad_std": 0.13710618019104004, "epsilon_dpo/beta_margin_mean": 1.829659104347229, "epsilon_dpo/beta_margin_std": 1.08203125, "epsilon_dpo/loss_margin_mean": 251.95303344726562, "grad_norm": 50.833736419677734, "kl/avg_steps": 0.9375, "kl/beta": 0.007335728034377098, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 3.009732580450086e-07, "logits/chosen": -2.130596876144409, "logits/rejected": -2.1031112670898438, "logps/chosen": -138.39865112304688, "logps/ref_chosen": -27.110448837280273, "logps/ref_rejected": -113.38800048828125, "logps/rejected": -476.6292419433594, "loss": 0.4338, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8097207546234131, "rewards/margins": 1.829659104347229, "rewards/rejected": -2.6393799781799316, "step": 336 }, { "epoch": 0.4948604992657856, "epsilon_dpo/beta": 0.007211537566035986, "epsilon_dpo/beta_margin_grad_mean": -0.2015434354543686, "epsilon_dpo/beta_margin_grad_std": 0.17217382788658142, "epsilon_dpo/beta_margin_mean": 1.7117812633514404, "epsilon_dpo/beta_margin_std": 1.1512197256088257, "epsilon_dpo/loss_margin_mean": 238.00546264648438, "grad_norm": 54.00065612792969, "kl/avg_steps": 0.78125, "kl/beta": 0.007267594337463379, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -2.1570067405700684, "logits/rejected": -2.0568230152130127, "logps/chosen": -177.97787475585938, "logps/ref_chosen": -46.36784362792969, "logps/ref_rejected": -109.42700958251953, "logps/rejected": -479.04248046875, "loss": 0.5162, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9518444538116455, "rewards/margins": 1.7117812633514404, "rewards/rejected": -2.663625717163086, "step": 337 }, { "epoch": 0.49632892804698975, "epsilon_dpo/beta": 0.007151126395910978, "epsilon_dpo/beta_margin_grad_mean": -0.21953918039798737, "epsilon_dpo/beta_margin_grad_std": 0.16535867750644684, "epsilon_dpo/beta_margin_mean": 1.531717300415039, "epsilon_dpo/beta_margin_std": 1.0485196113586426, "epsilon_dpo/loss_margin_mean": 214.6240692138672, "grad_norm": 54.01927185058594, "kl/avg_steps": 0.84375, "kl/beta": 0.007211256306618452, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.984578725527675e-07, "logits/chosen": -2.093170166015625, "logits/rejected": -2.0884156227111816, "logps/chosen": -169.5230712890625, "logps/ref_chosen": -40.62709045410156, "logps/ref_rejected": -94.02056121826172, "logps/rejected": -437.54058837890625, "loss": 0.5573, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9232571125030518, "rewards/margins": 1.531717300415039, "rewards/rejected": -2.454974412918091, "step": 338 }, { "epoch": 0.4977973568281938, "epsilon_dpo/beta": 0.007089058868587017, "epsilon_dpo/beta_margin_grad_mean": -0.19239673018455505, "epsilon_dpo/beta_margin_grad_std": 0.1555846780538559, "epsilon_dpo/beta_margin_mean": 1.7873952388763428, "epsilon_dpo/beta_margin_std": 1.163671851158142, "epsilon_dpo/loss_margin_mean": 252.5406494140625, "grad_norm": 48.617767333984375, "kl/avg_steps": 0.875, "kl/beta": 0.007150920573621988, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -2.1362557411193848, "logits/rejected": -2.0298609733581543, "logps/chosen": -171.6763458251953, "logps/ref_chosen": -42.66804885864258, "logps/ref_rejected": -113.68071746826172, "logps/rejected": -495.22967529296875, "loss": 0.4775, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9160549640655518, "rewards/margins": 1.7873952388763428, "rewards/rejected": -2.7034502029418945, "step": 339 }, { "epoch": 0.49926578560939794, "epsilon_dpo/beta": 0.007029782980680466, "epsilon_dpo/beta_margin_grad_mean": -0.22466416656970978, "epsilon_dpo/beta_margin_grad_std": 0.18301187455654144, "epsilon_dpo/beta_margin_mean": 1.512877106666565, "epsilon_dpo/beta_margin_std": 1.1476131677627563, "epsilon_dpo/loss_margin_mean": 215.7501983642578, "grad_norm": 53.56328582763672, "kl/avg_steps": 0.84375, "kl/beta": 0.007088892627507448, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.959373794541426e-07, "logits/chosen": -2.1749508380889893, "logits/rejected": -1.981499195098877, "logps/chosen": -152.33245849609375, "logps/ref_chosen": -33.344947814941406, "logps/ref_rejected": -78.67686462402344, "logps/rejected": -413.41455078125, "loss": 0.6076, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8387173414230347, "rewards/margins": 1.5128772258758545, "rewards/rejected": -2.3515944480895996, "step": 340 }, { "epoch": 0.5007342143906021, "epsilon_dpo/beta": 0.006977556273341179, "epsilon_dpo/beta_margin_grad_mean": -0.1975635439157486, "epsilon_dpo/beta_margin_grad_std": 0.181951105594635, "epsilon_dpo/beta_margin_mean": 1.7857365608215332, "epsilon_dpo/beta_margin_std": 1.1933374404907227, "epsilon_dpo/loss_margin_mean": 256.70587158203125, "grad_norm": 51.781864166259766, "kl/avg_steps": 0.75, "kl/beta": 0.007029580418020487, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.946753005532965e-07, "logits/chosen": -2.1569323539733887, "logits/rejected": -1.9963412284851074, "logps/chosen": -133.99024963378906, "logps/ref_chosen": -26.7175235748291, "logps/ref_rejected": -108.1113510131836, "logps/rejected": -472.0899353027344, "loss": 0.5095, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7515710592269897, "rewards/margins": 1.7857365608215332, "rewards/rejected": -2.5373077392578125, "step": 341 }, { "epoch": 0.5022026431718062, "epsilon_dpo/beta": 0.006912530865520239, "epsilon_dpo/beta_margin_grad_mean": -0.18946228921413422, "epsilon_dpo/beta_margin_grad_std": 0.1443195641040802, "epsilon_dpo/beta_margin_mean": 1.8066987991333008, "epsilon_dpo/beta_margin_std": 1.138454794883728, "epsilon_dpo/loss_margin_mean": 261.54833984375, "grad_norm": 45.30876159667969, "kl/avg_steps": 0.9375, "kl/beta": 0.006977251265197992, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.934120444167326e-07, "logits/chosen": -2.289546489715576, "logits/rejected": -1.9754924774169922, "logps/chosen": -175.69277954101562, "logps/ref_chosen": -55.37556838989258, "logps/ref_rejected": -100.79505920410156, "logps/rejected": -482.6606140136719, "loss": 0.4569, "rewards/accuracies": 0.984375, "rewards/chosen": -0.8333628177642822, "rewards/margins": 1.8066987991333008, "rewards/rejected": -2.640061378479004, "step": 342 }, { "epoch": 0.5036710719530103, "epsilon_dpo/beta": 0.006846167147159576, "epsilon_dpo/beta_margin_grad_mean": -0.21721310913562775, "epsilon_dpo/beta_margin_grad_std": 0.11748269945383072, "epsilon_dpo/beta_margin_mean": 1.4517802000045776, "epsilon_dpo/beta_margin_std": 0.8032383322715759, "epsilon_dpo/loss_margin_mean": 212.13150024414062, "grad_norm": 49.670379638671875, "kl/avg_steps": 0.96875, "kl/beta": 0.0069124470464885235, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -2.164459228515625, "logits/rejected": -2.1661453247070312, "logps/chosen": -178.66543579101562, "logps/ref_chosen": -42.08940124511719, "logps/ref_rejected": -117.90960693359375, "logps/rejected": -466.6171569824219, "loss": 0.514, "rewards/accuracies": 0.984375, "rewards/chosen": -0.9355216026306152, "rewards/margins": 1.451780080795288, "rewards/rejected": -2.3873019218444824, "step": 343 }, { "epoch": 0.5051395007342144, "epsilon_dpo/beta": 0.006797598209232092, "epsilon_dpo/beta_margin_grad_mean": -0.24506476521492004, "epsilon_dpo/beta_margin_grad_std": 0.1786363273859024, "epsilon_dpo/beta_margin_mean": 1.4522466659545898, "epsilon_dpo/beta_margin_std": 1.2337263822555542, "epsilon_dpo/loss_margin_mean": 214.36805725097656, "grad_norm": 57.320960998535156, "kl/avg_steps": 0.71875, "kl/beta": 0.006846125237643719, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -2.179155111312866, "logits/rejected": -2.0898489952087402, "logps/chosen": -162.51026916503906, "logps/ref_chosen": -34.435752868652344, "logps/ref_rejected": -96.5870361328125, "logps/rejected": -439.02960205078125, "loss": 0.6315, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8735264539718628, "rewards/margins": 1.4522466659545898, "rewards/rejected": -2.325773239135742, "step": 344 }, { "epoch": 0.5066079295154186, "epsilon_dpo/beta": 0.0067384676076471806, "epsilon_dpo/beta_margin_grad_mean": -0.18444088101387024, "epsilon_dpo/beta_margin_grad_std": 0.13682545721530914, "epsilon_dpo/beta_margin_mean": 1.7831730842590332, "epsilon_dpo/beta_margin_std": 1.0325002670288086, "epsilon_dpo/loss_margin_mean": 264.975341796875, "grad_norm": 50.00996398925781, "kl/avg_steps": 0.875, "kl/beta": 0.00679726991802454, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.896155456223163e-07, "logits/chosen": -2.24761962890625, "logits/rejected": -2.086604595184326, "logps/chosen": -173.568603515625, "logps/ref_chosen": -40.498329162597656, "logps/ref_rejected": -107.90937805175781, "logps/rejected": -505.9549560546875, "loss": 0.4414, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8974334001541138, "rewards/margins": 1.7831730842590332, "rewards/rejected": -2.6806066036224365, "step": 345 }, { "epoch": 0.5080763582966226, "epsilon_dpo/beta": 0.00667791161686182, "epsilon_dpo/beta_margin_grad_mean": -0.18874618411064148, "epsilon_dpo/beta_margin_grad_std": 0.13201214373111725, "epsilon_dpo/beta_margin_mean": 1.7088000774383545, "epsilon_dpo/beta_margin_std": 0.9547905325889587, "epsilon_dpo/loss_margin_mean": 256.1742858886719, "grad_norm": 42.087677001953125, "kl/avg_steps": 0.90625, "kl/beta": 0.006738309748470783, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.883479137196714e-07, "logits/chosen": -2.0703492164611816, "logits/rejected": -1.9229077100753784, "logps/chosen": -160.73776245117188, "logps/ref_chosen": -33.41410827636719, "logps/ref_rejected": -83.742919921875, "logps/rejected": -467.2408447265625, "loss": 0.4511, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8508467078208923, "rewards/margins": 1.7088000774383545, "rewards/rejected": -2.5596466064453125, "step": 346 }, { "epoch": 0.5095447870778267, "epsilon_dpo/beta": 0.006634632125496864, "epsilon_dpo/beta_margin_grad_mean": -0.26513612270355225, "epsilon_dpo/beta_margin_grad_std": 0.2060505747795105, "epsilon_dpo/beta_margin_mean": 1.3007481098175049, "epsilon_dpo/beta_margin_std": 1.2315279245376587, "epsilon_dpo/loss_margin_mean": 197.03175354003906, "grad_norm": 72.8255615234375, "kl/avg_steps": 0.65625, "kl/beta": 0.006677791941910982, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -2.1761817932128906, "logits/rejected": -2.0537772178649902, "logps/chosen": -213.68258666992188, "logps/ref_chosen": -46.14110565185547, "logps/ref_rejected": -97.28417205810547, "logps/rejected": -461.857421875, "loss": 0.7379, "rewards/accuracies": 0.875, "rewards/chosen": -1.1159558296203613, "rewards/margins": 1.3007481098175049, "rewards/rejected": -2.416703701019287, "step": 347 }, { "epoch": 0.5110132158590308, "epsilon_dpo/beta": 0.0065768626518547535, "epsilon_dpo/beta_margin_grad_mean": -0.20659483969211578, "epsilon_dpo/beta_margin_grad_std": 0.15565478801727295, "epsilon_dpo/beta_margin_mean": 1.618215560913086, "epsilon_dpo/beta_margin_std": 1.0140705108642578, "epsilon_dpo/loss_margin_mean": 246.40516662597656, "grad_norm": 40.67171859741211, "kl/avg_steps": 0.875, "kl/beta": 0.006634254939854145, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.858096518347179e-07, "logits/chosen": -2.202709674835205, "logits/rejected": -2.092670440673828, "logps/chosen": -162.21255493164062, "logps/ref_chosen": -36.474021911621094, "logps/ref_rejected": -93.86297607421875, "logps/rejected": -466.0066833496094, "loss": 0.5108, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8284679651260376, "rewards/margins": 1.618215560913086, "rewards/rejected": -2.446683645248413, "step": 348 }, { "epoch": 0.5124816446402349, "epsilon_dpo/beta": 0.006519814487546682, "epsilon_dpo/beta_margin_grad_mean": -0.2202288955450058, "epsilon_dpo/beta_margin_grad_std": 0.171220600605011, "epsilon_dpo/beta_margin_mean": 1.6194798946380615, "epsilon_dpo/beta_margin_std": 1.1991218328475952, "epsilon_dpo/loss_margin_mean": 248.80596923828125, "grad_norm": 54.361026763916016, "kl/avg_steps": 0.875, "kl/beta": 0.006576708517968655, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.845390887379706e-07, "logits/chosen": -2.1096177101135254, "logits/rejected": -2.068809986114502, "logps/chosen": -157.87994384765625, "logps/ref_chosen": -32.72327423095703, "logps/ref_rejected": -104.45381927490234, "logps/rejected": -478.4164733886719, "loss": 0.5577, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8179513216018677, "rewards/margins": 1.6194798946380615, "rewards/rejected": -2.4374313354492188, "step": 349 }, { "epoch": 0.5139500734214391, "epsilon_dpo/beta": 0.006471411325037479, "epsilon_dpo/beta_margin_grad_mean": -0.2324196696281433, "epsilon_dpo/beta_margin_grad_std": 0.18096159398555756, "epsilon_dpo/beta_margin_mean": 1.507636308670044, "epsilon_dpo/beta_margin_std": 1.1591410636901855, "epsilon_dpo/loss_margin_mean": 233.73513793945312, "grad_norm": 66.0088119506836, "kl/avg_steps": 0.75, "kl/beta": 0.006519661750644445, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -2.133009910583496, "logits/rejected": -2.081470489501953, "logps/chosen": -188.4402618408203, "logps/ref_chosen": -41.924171447753906, "logps/ref_rejected": -96.81851196289062, "logps/rejected": -477.0697326660156, "loss": 0.6014, "rewards/accuracies": 0.875, "rewards/chosen": -0.9508112668991089, "rewards/margins": 1.507636308670044, "rewards/rejected": -2.4584474563598633, "step": 350 }, { "epoch": 0.5154185022026432, "epsilon_dpo/beta": 0.006419192533940077, "epsilon_dpo/beta_margin_grad_mean": -0.22091074287891388, "epsilon_dpo/beta_margin_grad_std": 0.16051997244358063, "epsilon_dpo/beta_margin_mean": 1.5670552253723145, "epsilon_dpo/beta_margin_std": 1.1509732007980347, "epsilon_dpo/loss_margin_mean": 244.6831817626953, "grad_norm": 63.894622802734375, "kl/avg_steps": 0.8125, "kl/beta": 0.006471128202974796, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.819952656376487e-07, "logits/chosen": -2.141003131866455, "logits/rejected": -2.1702256202697754, "logps/chosen": -178.15701293945312, "logps/ref_chosen": -38.231632232666016, "logps/ref_rejected": -109.67044830322266, "logps/rejected": -494.2790222167969, "loss": 0.5544, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9005885124206543, "rewards/margins": 1.5670552253723145, "rewards/rejected": -2.4676437377929688, "step": 351 }, { "epoch": 0.5168869309838473, "epsilon_dpo/beta": 0.006367456633597612, "epsilon_dpo/beta_margin_grad_mean": -0.22667014598846436, "epsilon_dpo/beta_margin_grad_std": 0.1572793573141098, "epsilon_dpo/beta_margin_mean": 1.4911013841629028, "epsilon_dpo/beta_margin_std": 1.0576071739196777, "epsilon_dpo/loss_margin_mean": 234.71234130859375, "grad_norm": 50.314727783203125, "kl/avg_steps": 0.8125, "kl/beta": 0.0064189741387963295, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -2.2193918228149414, "logits/rejected": -2.0443153381347656, "logps/chosen": -167.52468872070312, "logps/ref_chosen": -39.300880432128906, "logps/ref_rejected": -82.15660095214844, "logps/rejected": -445.0927734375, "loss": 0.5649, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8179040551185608, "rewards/margins": 1.4911013841629028, "rewards/rejected": -2.3090052604675293, "step": 352 }, { "epoch": 0.5183553597650514, "epsilon_dpo/beta": 0.006326087284833193, "epsilon_dpo/beta_margin_grad_mean": -0.24461467564105988, "epsilon_dpo/beta_margin_grad_std": 0.182557612657547, "epsilon_dpo/beta_margin_mean": 1.4262304306030273, "epsilon_dpo/beta_margin_std": 1.1893874406814575, "epsilon_dpo/loss_margin_mean": 226.36761474609375, "grad_norm": 57.24125289916992, "kl/avg_steps": 0.65625, "kl/beta": 0.006367240101099014, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.794480701395219e-07, "logits/chosen": -2.104984760284424, "logits/rejected": -2.00747013092041, "logps/chosen": -186.15036010742188, "logps/ref_chosen": -37.74017333984375, "logps/ref_rejected": -86.70198059082031, "logps/rejected": -461.47979736328125, "loss": 0.6444, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9420816898345947, "rewards/margins": 1.4262304306030273, "rewards/rejected": -2.368312120437622, "step": 353 }, { "epoch": 0.5198237885462555, "epsilon_dpo/beta": 0.006276935804635286, "epsilon_dpo/beta_margin_grad_mean": -0.21808402240276337, "epsilon_dpo/beta_margin_grad_std": 0.14849531650543213, "epsilon_dpo/beta_margin_mean": 1.4829356670379639, "epsilon_dpo/beta_margin_std": 0.9004656672477722, "epsilon_dpo/loss_margin_mean": 236.8414306640625, "grad_norm": 44.26091384887695, "kl/avg_steps": 0.78125, "kl/beta": 0.006325727794319391, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.781732916288303e-07, "logits/chosen": -2.21701717376709, "logits/rejected": -2.05487322807312, "logps/chosen": -189.85113525390625, "logps/ref_chosen": -46.64361572265625, "logps/ref_rejected": -94.82032775878906, "logps/rejected": -474.8692626953125, "loss": 0.5369, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8998274803161621, "rewards/margins": 1.4829356670379639, "rewards/rejected": -2.382763147354126, "step": 354 }, { "epoch": 0.5212922173274597, "epsilon_dpo/beta": 0.006220430601388216, "epsilon_dpo/beta_margin_grad_mean": -0.19742940366268158, "epsilon_dpo/beta_margin_grad_std": 0.1338631510734558, "epsilon_dpo/beta_margin_mean": 1.626767873764038, "epsilon_dpo/beta_margin_std": 0.9073279500007629, "epsilon_dpo/loss_margin_mean": 261.8030700683594, "grad_norm": 43.440181732177734, "kl/avg_steps": 0.90625, "kl/beta": 0.0062766908667981625, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -2.233151435852051, "logits/rejected": -2.0062508583068848, "logps/chosen": -158.43057250976562, "logps/ref_chosen": -38.99018859863281, "logps/ref_rejected": -88.16902160644531, "logps/rejected": -469.4125061035156, "loss": 0.473, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7434677481651306, "rewards/margins": 1.626767873764038, "rewards/rejected": -2.3702354431152344, "step": 355 }, { "epoch": 0.5227606461086637, "epsilon_dpo/beta": 0.006174284033477306, "epsilon_dpo/beta_margin_grad_mean": -0.2459879219532013, "epsilon_dpo/beta_margin_grad_std": 0.1747446060180664, "epsilon_dpo/beta_margin_mean": 1.3851886987686157, "epsilon_dpo/beta_margin_std": 1.0974996089935303, "epsilon_dpo/loss_margin_mean": 225.0797882080078, "grad_norm": 52.8966178894043, "kl/avg_steps": 0.75, "kl/beta": 0.0062203193083405495, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -2.149020195007324, "logits/rejected": -1.9297568798065186, "logps/chosen": -177.79510498046875, "logps/ref_chosen": -36.000328063964844, "logps/ref_rejected": -81.04385375976562, "logps/rejected": -447.9184265136719, "loss": 0.6355, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8773345351219177, "rewards/margins": 1.3851888179779053, "rewards/rejected": -2.2625231742858887, "step": 356 }, { "epoch": 0.5242290748898678, "epsilon_dpo/beta": 0.006120603997260332, "epsilon_dpo/beta_margin_grad_mean": -0.22637690603733063, "epsilon_dpo/beta_margin_grad_std": 0.17489291727542877, "epsilon_dpo/beta_margin_mean": 1.5144197940826416, "epsilon_dpo/beta_margin_std": 1.1098681688308716, "epsilon_dpo/loss_margin_mean": 247.9174041748047, "grad_norm": 53.406307220458984, "kl/avg_steps": 0.875, "kl/beta": 0.006174014415591955, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -2.1703100204467773, "logits/rejected": -2.005626916885376, "logps/chosen": -169.40367126464844, "logps/ref_chosen": -37.324180603027344, "logps/ref_rejected": -90.94546508789062, "logps/rejected": -470.9423522949219, "loss": 0.5852, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8109502792358398, "rewards/margins": 1.5144197940826416, "rewards/rejected": -2.3253703117370605, "step": 357 }, { "epoch": 0.5256975036710719, "epsilon_dpo/beta": 0.0060713388957083225, "epsilon_dpo/beta_margin_grad_mean": -0.2422979176044464, "epsilon_dpo/beta_margin_grad_std": 0.17289938032627106, "epsilon_dpo/beta_margin_mean": 1.4026376008987427, "epsilon_dpo/beta_margin_std": 1.0886473655700684, "epsilon_dpo/loss_margin_mean": 231.60694885253906, "grad_norm": 54.73785400390625, "kl/avg_steps": 0.8125, "kl/beta": 0.006120460107922554, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.730670898658255e-07, "logits/chosen": -2.2487030029296875, "logits/rejected": -2.109694480895996, "logps/chosen": -169.3449249267578, "logps/ref_chosen": -45.07238006591797, "logps/ref_rejected": -94.94227600097656, "logps/rejected": -450.82177734375, "loss": 0.6267, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7564330101013184, "rewards/margins": 1.4026376008987427, "rewards/rejected": -2.1590704917907715, "step": 358 }, { "epoch": 0.527165932452276, "epsilon_dpo/beta": 0.0060280985198915005, "epsilon_dpo/beta_margin_grad_mean": -0.23849013447761536, "epsilon_dpo/beta_margin_grad_std": 0.1872168630361557, "epsilon_dpo/beta_margin_mean": 1.4863260984420776, "epsilon_dpo/beta_margin_std": 1.1895928382873535, "epsilon_dpo/loss_margin_mean": 247.45826721191406, "grad_norm": 62.71131134033203, "kl/avg_steps": 0.71875, "kl/beta": 0.006071132142096758, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.717889356869146e-07, "logits/chosen": -2.1315059661865234, "logits/rejected": -1.9415364265441895, "logps/chosen": -162.13319396972656, "logps/ref_chosen": -34.454952239990234, "logps/ref_rejected": -87.54586791992188, "logps/rejected": -462.682373046875, "loss": 0.6242, "rewards/accuracies": 0.875, "rewards/chosen": -0.7723500728607178, "rewards/margins": 1.486325979232788, "rewards/rejected": -2.258676052093506, "step": 359 }, { "epoch": 0.5286343612334802, "epsilon_dpo/beta": 0.005986965261399746, "epsilon_dpo/beta_margin_grad_mean": -0.2441015988588333, "epsilon_dpo/beta_margin_grad_std": 0.1673990935087204, "epsilon_dpo/beta_margin_mean": 1.3607901334762573, "epsilon_dpo/beta_margin_std": 0.9962857365608215, "epsilon_dpo/loss_margin_mean": 228.12159729003906, "grad_norm": 53.97544860839844, "kl/avg_steps": 0.6875, "kl/beta": 0.006027807481586933, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -2.0680994987487793, "logits/rejected": -1.957366704940796, "logps/chosen": -168.40216064453125, "logps/ref_chosen": -37.17042922973633, "logps/ref_rejected": -76.79757690429688, "logps/rejected": -436.15087890625, "loss": 0.6196, "rewards/accuracies": 0.875, "rewards/chosen": -0.7889617681503296, "rewards/margins": 1.3607900142669678, "rewards/rejected": -2.149751901626587, "step": 360 }, { "epoch": 0.5301027900146843, "epsilon_dpo/beta": 0.005942343734204769, "epsilon_dpo/beta_margin_grad_mean": -0.25044846534729004, "epsilon_dpo/beta_margin_grad_std": 0.18136946856975555, "epsilon_dpo/beta_margin_mean": 1.3711673021316528, "epsilon_dpo/beta_margin_std": 1.1313101053237915, "epsilon_dpo/loss_margin_mean": 231.51885986328125, "grad_norm": 63.17055892944336, "kl/avg_steps": 0.75, "kl/beta": 0.005986649077385664, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -2.1590943336486816, "logits/rejected": -2.121795177459717, "logps/chosen": -164.0542755126953, "logps/ref_chosen": -33.520301818847656, "logps/ref_rejected": -95.78604125976562, "logps/rejected": -457.8388671875, "loss": 0.6539, "rewards/accuracies": 0.875, "rewards/chosen": -0.7777143120765686, "rewards/margins": 1.3711671829223633, "rewards/rejected": -2.148881435394287, "step": 361 }, { "epoch": 0.5315712187958884, "epsilon_dpo/beta": 0.005890679080039263, "epsilon_dpo/beta_margin_grad_mean": -0.23398736119270325, "epsilon_dpo/beta_margin_grad_std": 0.1614319384098053, "epsilon_dpo/beta_margin_mean": 1.4454318284988403, "epsilon_dpo/beta_margin_std": 1.0421142578125, "epsilon_dpo/loss_margin_mean": 245.76043701171875, "grad_norm": 57.27875900268555, "kl/avg_steps": 0.875, "kl/beta": 0.005942083429545164, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.679511629948319e-07, "logits/chosen": -2.2858972549438477, "logits/rejected": -2.1964077949523926, "logps/chosen": -185.8004913330078, "logps/ref_chosen": -44.4540901184082, "logps/ref_rejected": -110.75244140625, "logps/rejected": -497.8592834472656, "loss": 0.5872, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8333665728569031, "rewards/margins": 1.4454319477081299, "rewards/rejected": -2.2787985801696777, "step": 362 }, { "epoch": 0.5330396475770925, "epsilon_dpo/beta": 0.005839582998305559, "epsilon_dpo/beta_margin_grad_mean": -0.18884162604808807, "epsilon_dpo/beta_margin_grad_std": 0.14109890162944794, "epsilon_dpo/beta_margin_mean": 1.7546730041503906, "epsilon_dpo/beta_margin_std": 1.0252107381820679, "epsilon_dpo/loss_margin_mean": 300.8768615722656, "grad_norm": 44.063777923583984, "kl/avg_steps": 0.875, "kl/beta": 0.00589054124429822, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -2.20405650138855, "logits/rejected": -2.084122657775879, "logps/chosen": -119.686767578125, "logps/ref_chosen": -23.62018585205078, "logps/ref_rejected": -79.18755340576172, "logps/rejected": -476.1310119628906, "loss": 0.4544, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5620150566101074, "rewards/margins": 1.7546730041503906, "rewards/rejected": -2.316688060760498, "step": 363 }, { "epoch": 0.5345080763582967, "epsilon_dpo/beta": 0.005788929760456085, "epsilon_dpo/beta_margin_grad_mean": -0.23159046471118927, "epsilon_dpo/beta_margin_grad_std": 0.15345177054405212, "epsilon_dpo/beta_margin_mean": 1.4228205680847168, "epsilon_dpo/beta_margin_std": 0.979676365852356, "epsilon_dpo/loss_margin_mean": 246.19557189941406, "grad_norm": 54.10791015625, "kl/avg_steps": 0.875, "kl/beta": 0.005839446093887091, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -2.1630282402038574, "logits/rejected": -2.1442086696624756, "logps/chosen": -151.59072875976562, "logps/ref_chosen": -31.6948184967041, "logps/ref_rejected": -98.25422668457031, "logps/rejected": -464.345703125, "loss": 0.5787, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6957234144210815, "rewards/margins": 1.4228205680847168, "rewards/rejected": -2.118544101715088, "step": 364 }, { "epoch": 0.5359765051395007, "epsilon_dpo/beta": 0.005736907012760639, "epsilon_dpo/beta_margin_grad_mean": -0.22249744832515717, "epsilon_dpo/beta_margin_grad_std": 0.13856177031993866, "epsilon_dpo/beta_margin_mean": 1.4774224758148193, "epsilon_dpo/beta_margin_std": 0.9550539255142212, "epsilon_dpo/loss_margin_mean": 257.8067932128906, "grad_norm": 45.98625564575195, "kl/avg_steps": 0.90625, "kl/beta": 0.005788794253021479, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.641091331089811e-07, "logits/chosen": -2.2106542587280273, "logits/rejected": -2.1826648712158203, "logps/chosen": -125.05105590820312, "logps/ref_chosen": -28.370468139648438, "logps/ref_rejected": -97.9366226196289, "logps/rejected": -452.42401123046875, "loss": 0.5397, "rewards/accuracies": 0.953125, "rewards/chosen": -0.5555899739265442, "rewards/margins": 1.4774224758148193, "rewards/rejected": -2.0330123901367188, "step": 365 }, { "epoch": 0.5374449339207048, "epsilon_dpo/beta": 0.005688969045877457, "epsilon_dpo/beta_margin_grad_mean": -0.241429403424263, "epsilon_dpo/beta_margin_grad_std": 0.15970946848392487, "epsilon_dpo/beta_margin_mean": 1.3868852853775024, "epsilon_dpo/beta_margin_std": 1.0170059204101562, "epsilon_dpo/loss_margin_mean": 244.25833129882812, "grad_norm": 45.932926177978516, "kl/avg_steps": 0.84375, "kl/beta": 0.0057368045672774315, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -2.1703920364379883, "logits/rejected": -2.1496224403381348, "logps/chosen": -141.24298095703125, "logps/ref_chosen": -29.695941925048828, "logps/ref_rejected": -91.68377685546875, "logps/rejected": -447.4891357421875, "loss": 0.605, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6359978914260864, "rewards/margins": 1.3868852853775024, "rewards/rejected": -2.022883176803589, "step": 366 }, { "epoch": 0.5389133627019089, "epsilon_dpo/beta": 0.005634258035570383, "epsilon_dpo/beta_margin_grad_mean": -0.23779228329658508, "epsilon_dpo/beta_margin_grad_std": 0.14979609847068787, "epsilon_dpo/beta_margin_mean": 1.3488836288452148, "epsilon_dpo/beta_margin_std": 0.8744153380393982, "epsilon_dpo/loss_margin_mean": 239.53407287597656, "grad_norm": 42.18586730957031, "kl/avg_steps": 0.96875, "kl/beta": 0.005688805133104324, "kl/n_epsilon_steps": 0.015625, "kl/p_epsilon_steps": 0.984375, "learning_rate": 2.615458646614349e-07, "logits/chosen": -2.2993650436401367, "logits/rejected": -2.1403145790100098, "logps/chosen": -153.87405395507812, "logps/ref_chosen": -43.07976531982422, "logps/ref_rejected": -82.26808166503906, "logps/rejected": -432.596435546875, "loss": 0.5899, "rewards/accuracies": 0.96875, "rewards/chosen": -0.624735951423645, "rewards/margins": 1.3488836288452148, "rewards/rejected": -1.9736195802688599, "step": 367 }, { "epoch": 0.540381791483113, "epsilon_dpo/beta": 0.005581961013376713, "epsilon_dpo/beta_margin_grad_mean": -0.2129683494567871, "epsilon_dpo/beta_margin_grad_std": 0.1246427670121193, "epsilon_dpo/beta_margin_mean": 1.4776982069015503, "epsilon_dpo/beta_margin_std": 0.806984007358551, "epsilon_dpo/loss_margin_mean": 264.9310607910156, "grad_norm": 42.2542610168457, "kl/avg_steps": 0.9375, "kl/beta": 0.005634223576635122, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -2.189484119415283, "logits/rejected": -2.1851706504821777, "logps/chosen": -142.2340545654297, "logps/ref_chosen": -28.834022521972656, "logps/ref_rejected": -104.38568115234375, "logps/rejected": -482.7167663574219, "loss": 0.5089, "rewards/accuracies": 0.984375, "rewards/chosen": -0.633552074432373, "rewards/margins": 1.4776982069015503, "rewards/rejected": -2.111250400543213, "step": 368 }, { "epoch": 0.5418502202643172, "epsilon_dpo/beta": 0.0055318609811365604, "epsilon_dpo/beta_margin_grad_mean": -0.223898246884346, "epsilon_dpo/beta_margin_grad_std": 0.13859553635120392, "epsilon_dpo/beta_margin_mean": 1.431330919265747, "epsilon_dpo/beta_margin_std": 0.8688164353370667, "epsilon_dpo/loss_margin_mean": 259.0464782714844, "grad_norm": 39.924076080322266, "kl/avg_steps": 0.90625, "kl/beta": 0.005581893492490053, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.589813792448196e-07, "logits/chosen": -2.307586193084717, "logits/rejected": -2.1981377601623535, "logps/chosen": -161.64993286132812, "logps/ref_chosen": -46.21360778808594, "logps/ref_rejected": -98.12519836425781, "logps/rejected": -472.6080017089844, "loss": 0.5458, "rewards/accuracies": 0.96875, "rewards/chosen": -0.640053391456604, "rewards/margins": 1.431330919265747, "rewards/rejected": -2.0713844299316406, "step": 369 }, { "epoch": 0.5433186490455213, "epsilon_dpo/beta": 0.005487365182489157, "epsilon_dpo/beta_margin_grad_mean": -0.26117002964019775, "epsilon_dpo/beta_margin_grad_std": 0.16114374995231628, "epsilon_dpo/beta_margin_mean": 1.2338489294052124, "epsilon_dpo/beta_margin_std": 0.9565902352333069, "epsilon_dpo/loss_margin_mean": 225.38717651367188, "grad_norm": 44.1972770690918, "kl/avg_steps": 0.8125, "kl/beta": 0.005531761795282364, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -2.3571860790252686, "logits/rejected": -2.2904090881347656, "logps/chosen": -190.09152221679688, "logps/ref_chosen": -51.13551330566406, "logps/ref_rejected": -101.64096069335938, "logps/rejected": -465.984130859375, "loss": 0.6656, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7640899419784546, "rewards/margins": 1.2338488101959229, "rewards/rejected": -1.997938871383667, "step": 370 }, { "epoch": 0.5447870778267254, "epsilon_dpo/beta": 0.005443139001727104, "epsilon_dpo/beta_margin_grad_mean": -0.2511758804321289, "epsilon_dpo/beta_margin_grad_std": 0.17598623037338257, "epsilon_dpo/beta_margin_mean": 1.3466187715530396, "epsilon_dpo/beta_margin_std": 1.0779163837432861, "epsilon_dpo/loss_margin_mean": 248.08079528808594, "grad_norm": 59.9031982421875, "kl/avg_steps": 0.8125, "kl/beta": 0.005487178452312946, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -2.2778027057647705, "logits/rejected": -2.235936164855957, "logps/chosen": -181.4659881591797, "logps/ref_chosen": -44.41009521484375, "logps/ref_rejected": -106.13936614990234, "logps/rejected": -491.27606201171875, "loss": 0.65, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7477129697799683, "rewards/margins": 1.34661865234375, "rewards/rejected": -2.094331741333008, "step": 371 }, { "epoch": 0.5462555066079295, "epsilon_dpo/beta": 0.005397569388151169, "epsilon_dpo/beta_margin_grad_mean": -0.2212544083595276, "epsilon_dpo/beta_margin_grad_std": 0.16570396721363068, "epsilon_dpo/beta_margin_mean": 1.5037719011306763, "epsilon_dpo/beta_margin_std": 1.0139695405960083, "epsilon_dpo/loss_margin_mean": 279.17572021484375, "grad_norm": 42.31486511230469, "kl/avg_steps": 0.84375, "kl/beta": 0.00544295459985733, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.551329606220976e-07, "logits/chosen": -2.188450336456299, "logits/rejected": -1.9817776679992676, "logps/chosen": -168.07699584960938, "logps/ref_chosen": -37.96990203857422, "logps/ref_rejected": -85.50196838378906, "logps/rejected": -494.7847595214844, "loss": 0.5632, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7031924724578857, "rewards/margins": 1.5037720203399658, "rewards/rejected": -2.2069644927978516, "step": 372 }, { "epoch": 0.5477239353891337, "epsilon_dpo/beta": 0.005349034443497658, "epsilon_dpo/beta_margin_grad_mean": -0.2134443074464798, "epsilon_dpo/beta_margin_grad_std": 0.14159946143627167, "epsilon_dpo/beta_margin_mean": 1.5223017930984497, "epsilon_dpo/beta_margin_std": 0.9210037589073181, "epsilon_dpo/loss_margin_mean": 284.92840576171875, "grad_norm": 46.6475944519043, "kl/avg_steps": 0.90625, "kl/beta": 0.005397413857281208, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.538498388222517e-07, "logits/chosen": -2.178375005722046, "logits/rejected": -2.014909267425537, "logps/chosen": -159.17428588867188, "logps/ref_chosen": -31.804325103759766, "logps/ref_rejected": -93.370361328125, "logps/rejected": -505.668701171875, "loss": 0.5204, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6824929118156433, "rewards/margins": 1.5223019123077393, "rewards/rejected": -2.2047948837280273, "step": 373 }, { "epoch": 0.5491923641703378, "epsilon_dpo/beta": 0.005304337944835424, "epsilon_dpo/beta_margin_grad_mean": -0.24420872330665588, "epsilon_dpo/beta_margin_grad_std": 0.14603427052497864, "epsilon_dpo/beta_margin_mean": 1.3295738697052002, "epsilon_dpo/beta_margin_std": 0.9483336806297302, "epsilon_dpo/loss_margin_mean": 251.12574768066406, "grad_norm": 65.01420593261719, "kl/avg_steps": 0.84375, "kl/beta": 0.00534893898293376, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.525666155755725e-07, "logits/chosen": -2.1958394050598145, "logits/rejected": -2.096358060836792, "logps/chosen": -186.239501953125, "logps/ref_chosen": -47.937625885009766, "logps/ref_rejected": -99.9871826171875, "logps/rejected": -489.414794921875, "loss": 0.6052, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7352691888809204, "rewards/margins": 1.3295738697052002, "rewards/rejected": -2.06484317779541, "step": 374 }, { "epoch": 0.5506607929515418, "epsilon_dpo/beta": 0.005254983436316252, "epsilon_dpo/beta_margin_grad_mean": -0.20512135326862335, "epsilon_dpo/beta_margin_grad_std": 0.1477726697921753, "epsilon_dpo/beta_margin_mean": 1.6202079057693481, "epsilon_dpo/beta_margin_std": 1.0169481039047241, "epsilon_dpo/loss_margin_mean": 308.59869384765625, "grad_norm": 44.36325454711914, "kl/avg_steps": 0.9375, "kl/beta": 0.0053041847422719, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.512833246961859e-07, "logits/chosen": -2.20353364944458, "logits/rejected": -2.1390933990478516, "logps/chosen": -151.54449462890625, "logps/ref_chosen": -37.31241989135742, "logps/ref_rejected": -96.08183288574219, "logps/rejected": -518.91259765625, "loss": 0.5067, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6010797023773193, "rewards/margins": 1.6202080249786377, "rewards/rejected": -2.221287727355957, "step": 375 }, { "epoch": 0.5521292217327459, "epsilon_dpo/beta": 0.005209460388869047, "epsilon_dpo/beta_margin_grad_mean": -0.20545507967472076, "epsilon_dpo/beta_margin_grad_std": 0.1631988137960434, "epsilon_dpo/beta_margin_mean": 1.6594352722167969, "epsilon_dpo/beta_margin_std": 1.0614945888519287, "epsilon_dpo/loss_margin_mean": 319.01361083984375, "grad_norm": 37.207130432128906, "kl/avg_steps": 0.875, "kl/beta": 0.005254920106381178, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.5e-07, "logits/chosen": -2.2137274742126465, "logits/rejected": -2.0475354194641113, "logps/chosen": -162.23599243164062, "logps/ref_chosen": -46.223838806152344, "logps/ref_rejected": -112.95925903320312, "logps/rejected": -547.9850463867188, "loss": 0.5114, "rewards/accuracies": 0.9375, "rewards/chosen": -0.605557918548584, "rewards/margins": 1.6594352722167969, "rewards/rejected": -2.264993190765381, "step": 376 }, { "epoch": 0.55359765051395, "epsilon_dpo/beta": 0.005162645131349564, "epsilon_dpo/beta_margin_grad_mean": -0.24356918036937714, "epsilon_dpo/beta_margin_grad_std": 0.13881158828735352, "epsilon_dpo/beta_margin_mean": 1.3458126783370972, "epsilon_dpo/beta_margin_std": 0.9675434827804565, "epsilon_dpo/loss_margin_mean": 260.9751281738281, "grad_norm": 44.168060302734375, "kl/avg_steps": 0.90625, "kl/beta": 0.0052093383856117725, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.487166753038141e-07, "logits/chosen": -2.0519776344299316, "logits/rejected": -2.01825213432312, "logps/chosen": -185.23043823242188, "logps/ref_chosen": -34.36379623413086, "logps/ref_rejected": -105.25419616699219, "logps/rejected": -517.095947265625, "loss": 0.5956, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7789356708526611, "rewards/margins": 1.3458126783370972, "rewards/rejected": -2.1247482299804688, "step": 377 }, { "epoch": 0.5550660792951542, "epsilon_dpo/beta": 0.005117891822010279, "epsilon_dpo/beta_margin_grad_mean": -0.22632895410060883, "epsilon_dpo/beta_margin_grad_std": 0.1629311591386795, "epsilon_dpo/beta_margin_mean": 1.4298152923583984, "epsilon_dpo/beta_margin_std": 0.9799423217773438, "epsilon_dpo/loss_margin_mean": 279.919189453125, "grad_norm": 52.31544876098633, "kl/avg_steps": 0.875, "kl/beta": 0.005162552464753389, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -2.059394359588623, "logits/rejected": -1.9235854148864746, "logps/chosen": -163.18124389648438, "logps/ref_chosen": -31.191333770751953, "logps/ref_rejected": -94.8112564086914, "logps/rejected": -506.7203674316406, "loss": 0.5897, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6769013404846191, "rewards/margins": 1.4298152923583984, "rewards/rejected": -2.1067166328430176, "step": 378 }, { "epoch": 0.5565345080763583, "epsilon_dpo/beta": 0.005076698027551174, "epsilon_dpo/beta_margin_grad_mean": -0.23987461626529694, "epsilon_dpo/beta_margin_grad_std": 0.16330820322036743, "epsilon_dpo/beta_margin_mean": 1.4291728734970093, "epsilon_dpo/beta_margin_std": 1.0851447582244873, "epsilon_dpo/loss_margin_mean": 282.147705078125, "grad_norm": 51.661922454833984, "kl/avg_steps": 0.8125, "kl/beta": 0.005117772147059441, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.461501611777483e-07, "logits/chosen": -2.0814695358276367, "logits/rejected": -2.1467087268829346, "logps/chosen": -197.74441528320312, "logps/ref_chosen": -35.68910217285156, "logps/ref_rejected": -120.69367980957031, "logps/rejected": -564.896728515625, "loss": 0.6029, "rewards/accuracies": 0.875, "rewards/chosen": -0.8247177004814148, "rewards/margins": 1.4291728734970093, "rewards/rejected": -2.2538905143737793, "step": 379 }, { "epoch": 0.5580029368575624, "epsilon_dpo/beta": 0.005032609216868877, "epsilon_dpo/beta_margin_grad_mean": -0.20714667439460754, "epsilon_dpo/beta_margin_grad_std": 0.1527094542980194, "epsilon_dpo/beta_margin_mean": 1.6478782892227173, "epsilon_dpo/beta_margin_std": 1.1102581024169922, "epsilon_dpo/loss_margin_mean": 327.93218994140625, "grad_norm": 39.429542541503906, "kl/avg_steps": 0.875, "kl/beta": 0.005076525267213583, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -2.105628728866577, "logits/rejected": -2.135561466217041, "logps/chosen": -184.11827087402344, "logps/ref_chosen": -30.20886993408203, "logps/ref_rejected": -111.17587280273438, "logps/rejected": -593.0174560546875, "loss": 0.5118, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7753435373306274, "rewards/margins": 1.6478781700134277, "rewards/rejected": -2.4232218265533447, "step": 380 }, { "epoch": 0.5594713656387665, "epsilon_dpo/beta": 0.004996819421648979, "epsilon_dpo/beta_margin_grad_mean": -0.2540000379085541, "epsilon_dpo/beta_margin_grad_std": 0.1617269366979599, "epsilon_dpo/beta_margin_mean": 1.3064846992492676, "epsilon_dpo/beta_margin_std": 1.0072511434555054, "epsilon_dpo/loss_margin_mean": 262.2966613769531, "grad_norm": 51.32357406616211, "kl/avg_steps": 0.71875, "kl/beta": 0.005032490938901901, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.435840528363426e-07, "logits/chosen": -2.112166404724121, "logits/rejected": -1.9345085620880127, "logps/chosen": -195.97369384765625, "logps/ref_chosen": -35.8305778503418, "logps/ref_rejected": -84.19041442871094, "logps/rejected": -506.63018798828125, "loss": 0.64, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8029674291610718, "rewards/margins": 1.3064846992492676, "rewards/rejected": -2.109452247619629, "step": 381 }, { "epoch": 0.5609397944199707, "epsilon_dpo/beta": 0.004958038218319416, "epsilon_dpo/beta_margin_grad_mean": -0.23661740124225616, "epsilon_dpo/beta_margin_grad_std": 0.16259969770908356, "epsilon_dpo/beta_margin_mean": 1.4187428951263428, "epsilon_dpo/beta_margin_std": 1.0073864459991455, "epsilon_dpo/loss_margin_mean": 286.858642578125, "grad_norm": 47.54762268066406, "kl/avg_steps": 0.78125, "kl/beta": 0.004996578209102154, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -2.1727161407470703, "logits/rejected": -2.1786251068115234, "logps/chosen": -211.81280517578125, "logps/ref_chosen": -42.912322998046875, "logps/ref_rejected": -116.84707641601562, "logps/rejected": -572.606201171875, "loss": 0.5931, "rewards/accuracies": 0.90625, "rewards/chosen": -0.839522123336792, "rewards/margins": 1.4187428951263428, "rewards/rejected": -2.2582650184631348, "step": 382 }, { "epoch": 0.5624082232011748, "epsilon_dpo/beta": 0.00491960346698761, "epsilon_dpo/beta_margin_grad_mean": -0.2571463882923126, "epsilon_dpo/beta_margin_grad_std": 0.1763610541820526, "epsilon_dpo/beta_margin_mean": 1.305212140083313, "epsilon_dpo/beta_margin_std": 1.0758684873580933, "epsilon_dpo/loss_margin_mean": 266.0962219238281, "grad_norm": 56.97015380859375, "kl/avg_steps": 0.78125, "kl/beta": 0.004957844968885183, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -2.0988528728485107, "logits/rejected": -2.077052354812622, "logps/chosen": -209.27877807617188, "logps/ref_chosen": -35.82965087890625, "logps/ref_rejected": -100.10868072509766, "logps/rejected": -539.654052734375, "loss": 0.6678, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8556368947029114, "rewards/margins": 1.3052120208740234, "rewards/rejected": -2.160849094390869, "step": 383 }, { "epoch": 0.5638766519823789, "epsilon_dpo/beta": 0.004878391977399588, "epsilon_dpo/beta_margin_grad_mean": -0.261491596698761, "epsilon_dpo/beta_margin_grad_std": 0.14361578226089478, "epsilon_dpo/beta_margin_mean": 1.1763639450073242, "epsilon_dpo/beta_margin_std": 0.7963204979896545, "epsilon_dpo/loss_margin_mean": 241.60084533691406, "grad_norm": 49.46163558959961, "kl/avg_steps": 0.84375, "kl/beta": 0.004919412080198526, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.397362428170992e-07, "logits/chosen": -2.1388659477233887, "logits/rejected": -2.1503121852874756, "logps/chosen": -204.53836059570312, "logps/ref_chosen": -36.32765579223633, "logps/ref_rejected": -88.09075927734375, "logps/rejected": -497.9023132324219, "loss": 0.6509, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8223646879196167, "rewards/margins": 1.1763639450073242, "rewards/rejected": -1.998728632926941, "step": 384 }, { "epoch": 0.5653450807635829, "epsilon_dpo/beta": 0.004834526218473911, "epsilon_dpo/beta_margin_grad_mean": -0.24184301495552063, "epsilon_dpo/beta_margin_grad_std": 0.1403016448020935, "epsilon_dpo/beta_margin_mean": 1.3218892812728882, "epsilon_dpo/beta_margin_std": 0.876196563243866, "epsilon_dpo/loss_margin_mean": 273.7486267089844, "grad_norm": 42.43720245361328, "kl/avg_steps": 0.90625, "kl/beta": 0.004878251813352108, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -2.1851940155029297, "logits/rejected": -1.9630916118621826, "logps/chosen": -205.151123046875, "logps/ref_chosen": -43.28904724121094, "logps/ref_rejected": -85.31612396240234, "logps/rejected": -520.9268188476562, "loss": 0.595, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7839208841323853, "rewards/margins": 1.3218892812728882, "rewards/rejected": -2.1058101654052734, "step": 385 }, { "epoch": 0.566813509544787, "epsilon_dpo/beta": 0.004791106563061476, "epsilon_dpo/beta_margin_grad_mean": -0.22500666975975037, "epsilon_dpo/beta_margin_grad_std": 0.1310817301273346, "epsilon_dpo/beta_margin_mean": 1.448143482208252, "epsilon_dpo/beta_margin_std": 0.916283369064331, "epsilon_dpo/loss_margin_mean": 302.5584411621094, "grad_norm": 44.11907196044922, "kl/avg_steps": 0.90625, "kl/beta": 0.004834439605474472, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -2.1383421421051025, "logits/rejected": -2.0959932804107666, "logps/chosen": -186.66197204589844, "logps/ref_chosen": -34.78173828125, "logps/ref_rejected": -98.69821166992188, "logps/rejected": -553.1368408203125, "loss": 0.5412, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7281551957130432, "rewards/margins": 1.448143482208252, "rewards/rejected": -2.1762986183166504, "step": 386 }, { "epoch": 0.5682819383259912, "epsilon_dpo/beta": 0.004748077597469091, "epsilon_dpo/beta_margin_grad_mean": -0.2660858631134033, "epsilon_dpo/beta_margin_grad_std": 0.1432296484708786, "epsilon_dpo/beta_margin_mean": 1.1700471639633179, "epsilon_dpo/beta_margin_std": 0.8362559080123901, "epsilon_dpo/loss_margin_mean": 246.71267700195312, "grad_norm": 52.421470642089844, "kl/avg_steps": 0.90625, "kl/beta": 0.0047910213470458984, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -2.315922737121582, "logits/rejected": -2.0701866149902344, "logps/chosen": -236.2892608642578, "logps/ref_chosen": -47.62559509277344, "logps/ref_rejected": -97.41331481933594, "logps/rejected": -532.7896728515625, "loss": 0.6613, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8966184854507446, "rewards/margins": 1.1700471639633179, "rewards/rejected": -2.0666656494140625, "step": 387 }, { "epoch": 0.5697503671071953, "epsilon_dpo/beta": 0.004706918261945248, "epsilon_dpo/beta_margin_grad_mean": -0.23977012932300568, "epsilon_dpo/beta_margin_grad_std": 0.14799103140830994, "epsilon_dpo/beta_margin_mean": 1.386051893234253, "epsilon_dpo/beta_margin_std": 0.9813371300697327, "epsilon_dpo/loss_margin_mean": 294.90777587890625, "grad_norm": 50.50879669189453, "kl/avg_steps": 0.875, "kl/beta": 0.004747992381453514, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -2.154555320739746, "logits/rejected": -2.115201473236084, "logps/chosen": -198.59793090820312, "logps/ref_chosen": -35.743316650390625, "logps/ref_rejected": -110.60121154785156, "logps/rejected": -568.3635864257812, "loss": 0.5907, "rewards/accuracies": 0.9375, "rewards/chosen": -0.767310619354248, "rewards/margins": 1.386051893234253, "rewards/rejected": -2.153362512588501, "step": 388 }, { "epoch": 0.5712187958883994, "epsilon_dpo/beta": 0.004673445131629705, "epsilon_dpo/beta_margin_grad_mean": -0.26743534207344055, "epsilon_dpo/beta_margin_grad_std": 0.18217839300632477, "epsilon_dpo/beta_margin_mean": 1.2179065942764282, "epsilon_dpo/beta_margin_std": 1.0329232215881348, "epsilon_dpo/loss_margin_mean": 261.6222839355469, "grad_norm": 57.76890563964844, "kl/avg_steps": 0.71875, "kl/beta": 0.004706807900220156, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -2.1483242511749268, "logits/rejected": -2.025583267211914, "logps/chosen": -195.3277587890625, "logps/ref_chosen": -40.81889724731445, "logps/ref_rejected": -82.08807373046875, "logps/rejected": -498.21917724609375, "loss": 0.7075, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7245763540267944, "rewards/margins": 1.2179067134857178, "rewards/rejected": -1.9424829483032227, "step": 389 }, { "epoch": 0.5726872246696035, "epsilon_dpo/beta": 0.004632791969925165, "epsilon_dpo/beta_margin_grad_mean": -0.24001896381378174, "epsilon_dpo/beta_margin_grad_std": 0.13718777894973755, "epsilon_dpo/beta_margin_mean": 1.3450864553451538, "epsilon_dpo/beta_margin_std": 0.9246547222137451, "epsilon_dpo/loss_margin_mean": 290.7643737792969, "grad_norm": 47.92219161987305, "kl/avg_steps": 0.875, "kl/beta": 0.004673219285905361, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.320488370051681e-07, "logits/chosen": -2.114452838897705, "logits/rejected": -1.998061180114746, "logps/chosen": -161.92718505859375, "logps/ref_chosen": -29.450889587402344, "logps/ref_rejected": -92.78984069824219, "logps/rejected": -516.030517578125, "loss": 0.5862, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6144026517868042, "rewards/margins": 1.3450864553451538, "rewards/rejected": -1.959489107131958, "step": 390 }, { "epoch": 0.5741556534508077, "epsilon_dpo/beta": 0.004595502279698849, "epsilon_dpo/beta_margin_grad_mean": -0.273894727230072, "epsilon_dpo/beta_margin_grad_std": 0.1488787978887558, "epsilon_dpo/beta_margin_mean": 1.1127126216888428, "epsilon_dpo/beta_margin_std": 0.8292704820632935, "epsilon_dpo/loss_margin_mean": 242.7264404296875, "grad_norm": 53.62302017211914, "kl/avg_steps": 0.8125, "kl/beta": 0.004632683005183935, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -2.1756703853607178, "logits/rejected": -2.104757785797119, "logps/chosen": -198.61831665039062, "logps/ref_chosen": -41.40957260131836, "logps/ref_rejected": -93.46768951416016, "logps/rejected": -493.40289306640625, "loss": 0.6924, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7247334718704224, "rewards/margins": 1.1127126216888428, "rewards/rejected": -1.8374460935592651, "step": 391 }, { "epoch": 0.5756240822320118, "epsilon_dpo/beta": 0.004555592313408852, "epsilon_dpo/beta_margin_grad_mean": -0.2217557728290558, "epsilon_dpo/beta_margin_grad_std": 0.1372254192829132, "epsilon_dpo/beta_margin_mean": 1.4560474157333374, "epsilon_dpo/beta_margin_std": 0.9119820594787598, "epsilon_dpo/loss_margin_mean": 320.1082763671875, "grad_norm": 46.09185791015625, "kl/avg_steps": 0.875, "kl/beta": 0.004595345817506313, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.294897926507156e-07, "logits/chosen": -2.129703998565674, "logits/rejected": -2.0612103939056396, "logps/chosen": -166.53561401367188, "logps/ref_chosen": -29.435636520385742, "logps/ref_rejected": -109.47046661376953, "logps/rejected": -566.6787109375, "loss": 0.5405, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6257168054580688, "rewards/margins": 1.456047534942627, "rewards/rejected": -2.0817642211914062, "step": 392 }, { "epoch": 0.5770925110132159, "epsilon_dpo/beta": 0.004520347807556391, "epsilon_dpo/beta_margin_grad_mean": -0.2750376760959625, "epsilon_dpo/beta_margin_grad_std": 0.16051527857780457, "epsilon_dpo/beta_margin_mean": 1.1350780725479126, "epsilon_dpo/beta_margin_std": 0.9084685444831848, "epsilon_dpo/loss_margin_mean": 251.88442993164062, "grad_norm": 51.396175384521484, "kl/avg_steps": 0.78125, "kl/beta": 0.004555485676974058, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -2.204120635986328, "logits/rejected": -2.0319063663482666, "logps/chosen": -160.30789184570312, "logps/ref_chosen": -27.59358024597168, "logps/ref_rejected": -77.63695526123047, "logps/rejected": -462.23565673828125, "loss": 0.7035, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6019172668457031, "rewards/margins": 1.1350780725479126, "rewards/rejected": -1.7369953393936157, "step": 393 }, { "epoch": 0.57856093979442, "epsilon_dpo/beta": 0.004478242713958025, "epsilon_dpo/beta_margin_grad_mean": -0.22447995841503143, "epsilon_dpo/beta_margin_grad_std": 0.1293361932039261, "epsilon_dpo/beta_margin_mean": 1.4324853420257568, "epsilon_dpo/beta_margin_std": 0.8796045184135437, "epsilon_dpo/loss_margin_mean": 320.1289367675781, "grad_norm": 36.82295227050781, "kl/avg_steps": 0.9375, "kl/beta": 0.0045201717875897884, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -2.1519179344177246, "logits/rejected": -2.0851173400878906, "logps/chosen": -171.11074829101562, "logps/ref_chosen": -33.609615325927734, "logps/ref_rejected": -97.24693298339844, "logps/rejected": -554.876953125, "loss": 0.5407, "rewards/accuracies": 0.96875, "rewards/chosen": -0.61604905128479, "rewards/margins": 1.4324853420257568, "rewards/rejected": -2.048534393310547, "step": 394 }, { "epoch": 0.580029368575624, "epsilon_dpo/beta": 0.004446446429938078, "epsilon_dpo/beta_margin_grad_mean": -0.2658129632472992, "epsilon_dpo/beta_margin_grad_std": 0.17667248845100403, "epsilon_dpo/beta_margin_mean": 1.248099446296692, "epsilon_dpo/beta_margin_std": 1.0518742799758911, "epsilon_dpo/loss_margin_mean": 281.7148132324219, "grad_norm": 55.038551330566406, "kl/avg_steps": 0.71875, "kl/beta": 0.004478188697248697, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -2.1458773612976074, "logits/rejected": -2.072359085083008, "logps/chosen": -198.61692810058594, "logps/ref_chosen": -42.13022994995117, "logps/ref_rejected": -89.85330963134766, "logps/rejected": -528.0548095703125, "loss": 0.6914, "rewards/accuracies": 0.875, "rewards/chosen": -0.6987878084182739, "rewards/margins": 1.248099446296692, "rewards/rejected": -1.9468872547149658, "step": 395 }, { "epoch": 0.5814977973568282, "epsilon_dpo/beta": 0.004410546738654375, "epsilon_dpo/beta_margin_grad_mean": -0.25534769892692566, "epsilon_dpo/beta_margin_grad_std": 0.17317545413970947, "epsilon_dpo/beta_margin_mean": 1.317757248878479, "epsilon_dpo/beta_margin_std": 1.0815587043762207, "epsilon_dpo/loss_margin_mean": 299.5541687011719, "grad_norm": 54.677188873291016, "kl/avg_steps": 0.8125, "kl/beta": 0.004446231294423342, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -2.223626136779785, "logits/rejected": -2.068613290786743, "logps/chosen": -198.652099609375, "logps/ref_chosen": -39.016197204589844, "logps/ref_rejected": -94.28376007080078, "logps/rejected": -553.473876953125, "loss": 0.6594, "rewards/accuracies": 0.90625, "rewards/chosen": -0.706026017665863, "rewards/margins": 1.3177571296691895, "rewards/rejected": -2.0237832069396973, "step": 396 }, { "epoch": 0.5829662261380323, "epsilon_dpo/beta": 0.004374999552965164, "epsilon_dpo/beta_margin_grad_mean": -0.2254161238670349, "epsilon_dpo/beta_margin_grad_std": 0.1620989739894867, "epsilon_dpo/beta_margin_mean": 1.489986777305603, "epsilon_dpo/beta_margin_std": 1.04046630859375, "epsilon_dpo/loss_margin_mean": 341.36114501953125, "grad_norm": 48.635677337646484, "kl/avg_steps": 0.8125, "kl/beta": 0.004410396795719862, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -2.2994096279144287, "logits/rejected": -2.131638526916504, "logps/chosen": -192.5784149169922, "logps/ref_chosen": -42.487823486328125, "logps/ref_rejected": -111.77519226074219, "logps/rejected": -603.2269287109375, "loss": 0.5673, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6591649055480957, "rewards/margins": 1.4899868965148926, "rewards/rejected": -2.1491518020629883, "step": 397 }, { "epoch": 0.5844346549192364, "epsilon_dpo/beta": 0.004342474043369293, "epsilon_dpo/beta_margin_grad_mean": -0.2421146184206009, "epsilon_dpo/beta_margin_grad_std": 0.1806570291519165, "epsilon_dpo/beta_margin_mean": 1.4041334390640259, "epsilon_dpo/beta_margin_std": 1.1149554252624512, "epsilon_dpo/loss_margin_mean": 324.445556640625, "grad_norm": 49.05772399902344, "kl/avg_steps": 0.75, "kl/beta": 0.004374851007014513, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -2.292893409729004, "logits/rejected": -2.2275965213775635, "logps/chosen": -206.82467651367188, "logps/ref_chosen": -47.66093444824219, "logps/ref_rejected": -116.59112548828125, "logps/rejected": -600.200439453125, "loss": 0.6398, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6942225694656372, "rewards/margins": 1.4041335582733154, "rewards/rejected": -2.098356246948242, "step": 398 }, { "epoch": 0.5859030837004405, "epsilon_dpo/beta": 0.004307433497160673, "epsilon_dpo/beta_margin_grad_mean": -0.24833783507347107, "epsilon_dpo/beta_margin_grad_std": 0.16362829506397247, "epsilon_dpo/beta_margin_mean": 1.3256672620773315, "epsilon_dpo/beta_margin_std": 0.9880812168121338, "epsilon_dpo/loss_margin_mean": 308.531494140625, "grad_norm": 54.13144302368164, "kl/avg_steps": 0.8125, "kl/beta": 0.004342284053564072, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -2.2859010696411133, "logits/rejected": -2.0517654418945312, "logps/chosen": -172.4034423828125, "logps/ref_chosen": -35.37146759033203, "logps/ref_rejected": -83.08477020263672, "logps/rejected": -528.648193359375, "loss": 0.6303, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5922940969467163, "rewards/margins": 1.325667142868042, "rewards/rejected": -1.9179613590240479, "step": 399 }, { "epoch": 0.5873715124816447, "epsilon_dpo/beta": 0.0042700255289673805, "epsilon_dpo/beta_margin_grad_mean": -0.23046433925628662, "epsilon_dpo/beta_margin_grad_std": 0.13983549177646637, "epsilon_dpo/beta_margin_mean": 1.362850308418274, "epsilon_dpo/beta_margin_std": 0.8101769089698792, "epsilon_dpo/loss_margin_mean": 319.66986083984375, "grad_norm": 42.21733093261719, "kl/avg_steps": 0.875, "kl/beta": 0.0043072872795164585, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.192779273338215e-07, "logits/chosen": -2.346558094024658, "logits/rejected": -2.2209692001342773, "logps/chosen": -214.174560546875, "logps/ref_chosen": -52.37220001220703, "logps/ref_rejected": -110.96676635742188, "logps/rejected": -592.43896484375, "loss": 0.5665, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6924352645874023, "rewards/margins": 1.3628504276275635, "rewards/rejected": -2.0552854537963867, "step": 400 }, { "epoch": 0.5873715124816447, "eval_epsilon_dpo/beta": 0.004249695222824812, "eval_epsilon_dpo/beta_margin_grad_mean": -0.35010644793510437, "eval_epsilon_dpo/beta_margin_grad_std": 0.18771161139011383, "eval_epsilon_dpo/beta_margin_mean": 0.7586736679077148, "eval_epsilon_dpo/beta_margin_std": 0.9838672876358032, "eval_epsilon_dpo/loss_margin_mean": 180.02191162109375, "eval_kl/n_epsilon_steps": 0.25813356041908264, "eval_kl/p_epsilon_steps": 0.741866409778595, "eval_logits/chosen": -2.246497631072998, "eval_logits/rejected": -2.1281936168670654, "eval_logps/chosen": -320.8791198730469, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -524.6903076171875, "eval_loss": 0.4863179922103882, "eval_rewards/accuracies": 0.7564212083816528, "eval_rewards/chosen": -1.077276587486267, "eval_rewards/margins": 0.7586737275123596, "eval_rewards/rejected": -1.835950255393982, "eval_runtime": 38.3555, "eval_samples_per_second": 60.982, "eval_steps_per_second": 1.929, "step": 400 }, { "epoch": 0.5888399412628488, "epsilon_dpo/beta": 0.0042383247055113316, "epsilon_dpo/beta_margin_grad_mean": -0.2859787940979004, "epsilon_dpo/beta_margin_grad_std": 0.17702849209308624, "epsilon_dpo/beta_margin_mean": 1.153497338294983, "epsilon_dpo/beta_margin_std": 1.093888521194458, "epsilon_dpo/loss_margin_mean": 273.0599365234375, "grad_norm": 55.85029983520508, "kl/avg_steps": 0.75, "kl/beta": 0.004269925411790609, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -2.2151618003845215, "logits/rejected": -2.1000635623931885, "logps/chosen": -206.0033721923828, "logps/ref_chosen": -39.69450378417969, "logps/ref_rejected": -90.86283874511719, "logps/rejected": -530.2316284179688, "loss": 0.7457, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7069000005722046, "rewards/margins": 1.153497338294983, "rewards/rejected": -1.8603973388671875, "step": 401 }, { "epoch": 0.5903083700440529, "epsilon_dpo/beta": 0.004204125143587589, "epsilon_dpo/beta_margin_grad_mean": -0.251901239156723, "epsilon_dpo/beta_margin_grad_std": 0.14963571727275848, "epsilon_dpo/beta_margin_mean": 1.2545827627182007, "epsilon_dpo/beta_margin_std": 0.8603823184967041, "epsilon_dpo/loss_margin_mean": 299.1294860839844, "grad_norm": 44.19596862792969, "kl/avg_steps": 0.8125, "kl/beta": 0.004238139372318983, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -2.216102123260498, "logits/rejected": -2.0876975059509277, "logps/chosen": -187.8063201904297, "logps/ref_chosen": -38.76295852661133, "logps/ref_rejected": -86.50106811523438, "logps/rejected": -534.6738891601562, "loss": 0.6286, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6281874179840088, "rewards/margins": 1.2545827627182007, "rewards/rejected": -1.8827701807022095, "step": 402 }, { "epoch": 0.591776798825257, "epsilon_dpo/beta": 0.004170241765677929, "epsilon_dpo/beta_margin_grad_mean": -0.2570376694202423, "epsilon_dpo/beta_margin_grad_std": 0.15749378502368927, "epsilon_dpo/beta_margin_mean": 1.245935082435608, "epsilon_dpo/beta_margin_std": 0.9069526791572571, "epsilon_dpo/loss_margin_mean": 299.4497985839844, "grad_norm": 45.329769134521484, "kl/avg_steps": 0.8125, "kl/beta": 0.004203982185572386, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.154609112620295e-07, "logits/chosen": -2.254790782928467, "logits/rejected": -2.070467472076416, "logps/chosen": -162.46539306640625, "logps/ref_chosen": -29.60453224182129, "logps/ref_rejected": -82.97395324707031, "logps/rejected": -515.2846069335938, "loss": 0.6468, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5560154914855957, "rewards/margins": 1.2459352016448975, "rewards/rejected": -1.8019505739212036, "step": 403 }, { "epoch": 0.593245227606461, "epsilon_dpo/beta": 0.004143148194998503, "epsilon_dpo/beta_margin_grad_mean": -0.2758185565471649, "epsilon_dpo/beta_margin_grad_std": 0.16760212182998657, "epsilon_dpo/beta_margin_mean": 1.1664576530456543, "epsilon_dpo/beta_margin_std": 0.9855588674545288, "epsilon_dpo/loss_margin_mean": 282.7168884277344, "grad_norm": 52.8735466003418, "kl/avg_steps": 0.65625, "kl/beta": 0.004170100204646587, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -2.253624439239502, "logits/rejected": -2.092195987701416, "logps/chosen": -182.46783447265625, "logps/ref_chosen": -32.369415283203125, "logps/ref_rejected": -84.27439880371094, "logps/rejected": -517.0897216796875, "loss": 0.7084, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6246769428253174, "rewards/margins": 1.1664576530456543, "rewards/rejected": -1.7911345958709717, "step": 404 }, { "epoch": 0.5947136563876652, "epsilon_dpo/beta": 0.004105777945369482, "epsilon_dpo/beta_margin_grad_mean": -0.27076855301856995, "epsilon_dpo/beta_margin_grad_std": 0.15409667789936066, "epsilon_dpo/beta_margin_mean": 1.1753402948379517, "epsilon_dpo/beta_margin_std": 0.9294463396072388, "epsilon_dpo/loss_margin_mean": 286.6537170410156, "grad_norm": 45.45560836791992, "kl/avg_steps": 0.90625, "kl/beta": 0.004142912104725838, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.129207286861638e-07, "logits/chosen": -2.2662670612335205, "logits/rejected": -2.1083085536956787, "logps/chosen": -207.39706420898438, "logps/ref_chosen": -45.16187286376953, "logps/ref_rejected": -93.87014770507812, "logps/rejected": -542.759033203125, "loss": 0.6822, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6669937372207642, "rewards/margins": 1.1753404140472412, "rewards/rejected": -1.8423340320587158, "step": 405 }, { "epoch": 0.5961820851688693, "epsilon_dpo/beta": 0.004071469884365797, "epsilon_dpo/beta_margin_grad_mean": -0.24695971608161926, "epsilon_dpo/beta_margin_grad_std": 0.14529289305210114, "epsilon_dpo/beta_margin_mean": 1.2771347761154175, "epsilon_dpo/beta_margin_std": 0.8445379137992859, "epsilon_dpo/loss_margin_mean": 314.2767639160156, "grad_norm": 44.63689422607422, "kl/avg_steps": 0.84375, "kl/beta": 0.004105704370886087, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -2.2001147270202637, "logits/rejected": -2.1016151905059814, "logps/chosen": -175.9896240234375, "logps/ref_chosen": -32.108238220214844, "logps/ref_rejected": -99.3056640625, "logps/rejected": -557.4638671875, "loss": 0.612, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5877770185470581, "rewards/margins": 1.2771347761154175, "rewards/rejected": -1.8649117946624756, "step": 406 }, { "epoch": 0.5976505139500734, "epsilon_dpo/beta": 0.0040450384840369225, "epsilon_dpo/beta_margin_grad_mean": -0.2901814579963684, "epsilon_dpo/beta_margin_grad_std": 0.17480535805225372, "epsilon_dpo/beta_margin_mean": 1.060813307762146, "epsilon_dpo/beta_margin_std": 0.9306771159172058, "epsilon_dpo/loss_margin_mean": 263.4065246582031, "grad_norm": 52.973270416259766, "kl/avg_steps": 0.65625, "kl/beta": 0.004071352072060108, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -2.2982513904571533, "logits/rejected": -2.1020822525024414, "logps/chosen": -181.23577880859375, "logps/ref_chosen": -34.63081359863281, "logps/ref_rejected": -83.40669250488281, "logps/rejected": -493.41815185546875, "loss": 0.7582, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5955917835235596, "rewards/margins": 1.060813307762146, "rewards/rejected": -1.6564050912857056, "step": 407 }, { "epoch": 0.5991189427312775, "epsilon_dpo/beta": 0.004012344870716333, "epsilon_dpo/beta_margin_grad_mean": -0.2729249894618988, "epsilon_dpo/beta_margin_grad_std": 0.14225371181964874, "epsilon_dpo/beta_margin_mean": 1.1006709337234497, "epsilon_dpo/beta_margin_std": 0.758716344833374, "epsilon_dpo/loss_margin_mean": 274.95880126953125, "grad_norm": 44.9481201171875, "kl/avg_steps": 0.8125, "kl/beta": 0.004044807981699705, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -2.361016273498535, "logits/rejected": -2.139800548553467, "logps/chosen": -198.23095703125, "logps/ref_chosen": -46.1392822265625, "logps/ref_rejected": -96.3233642578125, "logps/rejected": -523.3738403320312, "loss": 0.682, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6118342280387878, "rewards/margins": 1.1006710529327393, "rewards/rejected": -1.7125052213668823, "step": 408 }, { "epoch": 0.6005873715124816, "epsilon_dpo/beta": 0.003982515539973974, "epsilon_dpo/beta_margin_grad_mean": -0.294739693403244, "epsilon_dpo/beta_margin_grad_std": 0.16220302879810333, "epsilon_dpo/beta_margin_mean": 1.007659673690796, "epsilon_dpo/beta_margin_std": 0.8669053316116333, "epsilon_dpo/loss_margin_mean": 253.92869567871094, "grad_norm": 45.330448150634766, "kl/avg_steps": 0.75, "kl/beta": 0.0040122088976204395, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -2.2953333854675293, "logits/rejected": -2.1765618324279785, "logps/chosen": -214.66848754882812, "logps/ref_chosen": -48.41924285888672, "logps/ref_rejected": -88.46084594726562, "logps/rejected": -508.6387939453125, "loss": 0.7654, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6647806167602539, "rewards/margins": 1.007659673690796, "rewards/rejected": -1.6724402904510498, "step": 409 }, { "epoch": 0.6020558002936858, "epsilon_dpo/beta": 0.003950380254536867, "epsilon_dpo/beta_margin_grad_mean": -0.27121517062187195, "epsilon_dpo/beta_margin_grad_std": 0.15460430085659027, "epsilon_dpo/beta_margin_mean": 1.1261425018310547, "epsilon_dpo/beta_margin_std": 0.8273832201957703, "epsilon_dpo/loss_margin_mean": 285.80035400390625, "grad_norm": 39.14712905883789, "kl/avg_steps": 0.8125, "kl/beta": 0.003982341382652521, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.065879555832674e-07, "logits/chosen": -2.258359909057617, "logits/rejected": -2.2356643676757812, "logps/chosen": -171.04891967773438, "logps/ref_chosen": -32.20702362060547, "logps/ref_rejected": -90.97166442871094, "logps/rejected": -515.6138916015625, "loss": 0.6897, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5501143336296082, "rewards/margins": 1.1261425018310547, "rewards/rejected": -1.676256775856018, "step": 410 }, { "epoch": 0.6035242290748899, "epsilon_dpo/beta": 0.0039160726591944695, "epsilon_dpo/beta_margin_grad_mean": -0.2674121856689453, "epsilon_dpo/beta_margin_grad_std": 0.12682108581066132, "epsilon_dpo/beta_margin_mean": 1.1220488548278809, "epsilon_dpo/beta_margin_std": 0.7217124104499817, "epsilon_dpo/loss_margin_mean": 286.949951171875, "grad_norm": 38.4173698425293, "kl/avg_steps": 0.875, "kl/beta": 0.003950245678424835, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -2.2671849727630615, "logits/rejected": -2.18176007270813, "logps/chosen": -163.12564086914062, "logps/ref_chosen": -27.866039276123047, "logps/ref_rejected": -87.75438690185547, "logps/rejected": -509.96392822265625, "loss": 0.6557, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5305934548377991, "rewards/margins": 1.1220488548278809, "rewards/rejected": -1.6526423692703247, "step": 411 }, { "epoch": 0.604992657856094, "epsilon_dpo/beta": 0.0038821042980998755, "epsilon_dpo/beta_margin_grad_mean": -0.254239022731781, "epsilon_dpo/beta_margin_grad_std": 0.13515068590641022, "epsilon_dpo/beta_margin_mean": 1.2237414121627808, "epsilon_dpo/beta_margin_std": 0.7976933121681213, "epsilon_dpo/loss_margin_mean": 315.66949462890625, "grad_norm": 39.3294563293457, "kl/avg_steps": 0.875, "kl/beta": 0.00391598092392087, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -2.2864794731140137, "logits/rejected": -2.262669563293457, "logps/chosen": -154.23080444335938, "logps/ref_chosen": -31.307266235351562, "logps/ref_rejected": -107.21038818359375, "logps/rejected": -545.803466796875, "loss": 0.6232, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4781460464000702, "rewards/margins": 1.2237414121627808, "rewards/rejected": -1.7018874883651733, "step": 412 }, { "epoch": 0.6064610866372981, "epsilon_dpo/beta": 0.003849643748253584, "epsilon_dpo/beta_margin_grad_mean": -0.2595198452472687, "epsilon_dpo/beta_margin_grad_std": 0.14788804948329926, "epsilon_dpo/beta_margin_mean": 1.1739351749420166, "epsilon_dpo/beta_margin_std": 0.7997394800186157, "epsilon_dpo/loss_margin_mean": 305.6657409667969, "grad_norm": 38.31694412231445, "kl/avg_steps": 0.84375, "kl/beta": 0.003882013261318207, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -2.353483200073242, "logits/rejected": -2.2761778831481934, "logps/chosen": -189.5083465576172, "logps/ref_chosen": -43.01777648925781, "logps/ref_rejected": -106.10704040527344, "logps/rejected": -558.2633666992188, "loss": 0.6576, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5653949975967407, "rewards/margins": 1.1739351749420166, "rewards/rejected": -1.7393302917480469, "step": 413 }, { "epoch": 0.6079295154185022, "epsilon_dpo/beta": 0.0038186372257769108, "epsilon_dpo/beta_margin_grad_mean": -0.2614062428474426, "epsilon_dpo/beta_margin_grad_std": 0.14156121015548706, "epsilon_dpo/beta_margin_mean": 1.181972861289978, "epsilon_dpo/beta_margin_std": 0.8064230680465698, "epsilon_dpo/loss_margin_mean": 310.2454833984375, "grad_norm": 46.121341705322266, "kl/avg_steps": 0.8125, "kl/beta": 0.003849532688036561, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -2.2362513542175293, "logits/rejected": -2.2514965534210205, "logps/chosen": -161.10008239746094, "logps/ref_chosen": -33.92742919921875, "logps/ref_rejected": -93.77487182617188, "logps/rejected": -531.1929931640625, "loss": 0.6491, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4871448874473572, "rewards/margins": 1.181972861289978, "rewards/rejected": -1.6691176891326904, "step": 414 }, { "epoch": 0.6093979441997063, "epsilon_dpo/beta": 0.0037890539970248938, "epsilon_dpo/beta_margin_grad_mean": -0.281541645526886, "epsilon_dpo/beta_margin_grad_std": 0.14272256195545197, "epsilon_dpo/beta_margin_mean": 1.0638197660446167, "epsilon_dpo/beta_margin_std": 0.7809419631958008, "epsilon_dpo/loss_margin_mean": 281.4879150390625, "grad_norm": 42.91494369506836, "kl/avg_steps": 0.78125, "kl/beta": 0.003818507306277752, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.002837796667909e-07, "logits/chosen": -2.451190948486328, "logits/rejected": -2.4177732467651367, "logps/chosen": -195.28121948242188, "logps/ref_chosen": -45.32706832885742, "logps/ref_rejected": -108.54624938964844, "logps/rejected": -539.98828125, "loss": 0.7054, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5699288845062256, "rewards/margins": 1.0638197660446167, "rewards/rejected": -1.6337485313415527, "step": 415 }, { "epoch": 0.6108663729809104, "epsilon_dpo/beta": 0.00375731335952878, "epsilon_dpo/beta_margin_grad_mean": -0.2511499524116516, "epsilon_dpo/beta_margin_grad_std": 0.1389520764350891, "epsilon_dpo/beta_margin_mean": 1.231960654258728, "epsilon_dpo/beta_margin_std": 0.7866246700286865, "epsilon_dpo/loss_margin_mean": 328.50146484375, "grad_norm": 41.69633483886719, "kl/avg_steps": 0.84375, "kl/beta": 0.003788906615227461, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.990267419549914e-07, "logits/chosen": -2.335287094116211, "logits/rejected": -2.2950291633605957, "logps/chosen": -167.67477416992188, "logps/ref_chosen": -38.674949645996094, "logps/ref_rejected": -98.17755889892578, "logps/rejected": -555.6788330078125, "loss": 0.6203, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4858553111553192, "rewards/margins": 1.2319605350494385, "rewards/rejected": -1.71781587600708, "step": 416 }, { "epoch": 0.6123348017621145, "epsilon_dpo/beta": 0.0037270504981279373, "epsilon_dpo/beta_margin_grad_mean": -0.26951104402542114, "epsilon_dpo/beta_margin_grad_std": 0.15652604401111603, "epsilon_dpo/beta_margin_mean": 1.1333545446395874, "epsilon_dpo/beta_margin_std": 0.8189650774002075, "epsilon_dpo/loss_margin_mean": 304.8978271484375, "grad_norm": 43.83847427368164, "kl/avg_steps": 0.8125, "kl/beta": 0.003757205093279481, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -2.3649911880493164, "logits/rejected": -2.098527431488037, "logps/chosen": -147.36915588378906, "logps/ref_chosen": -26.17800521850586, "logps/ref_rejected": -74.53215026855469, "logps/rejected": -500.62115478515625, "loss": 0.6858, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4534587562084198, "rewards/margins": 1.1333545446395874, "rewards/rejected": -1.5868133306503296, "step": 417 }, { "epoch": 0.6138032305433186, "epsilon_dpo/beta": 0.0036970123182982206, "epsilon_dpo/beta_margin_grad_mean": -0.25392869114875793, "epsilon_dpo/beta_margin_grad_std": 0.1371525526046753, "epsilon_dpo/beta_margin_mean": 1.2147231101989746, "epsilon_dpo/beta_margin_std": 0.7788877487182617, "epsilon_dpo/loss_margin_mean": 329.29913330078125, "grad_norm": 42.56073760986328, "kl/avg_steps": 0.8125, "kl/beta": 0.0037269238382577896, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.965167291983757e-07, "logits/chosen": -2.3986239433288574, "logits/rejected": -2.3206000328063965, "logps/chosen": -180.94284057617188, "logps/ref_chosen": -51.68669128417969, "logps/ref_rejected": -112.65290832519531, "logps/rejected": -571.2081298828125, "loss": 0.6258, "rewards/accuracies": 0.9375, "rewards/chosen": -0.47949230670928955, "rewards/margins": 1.2147231101989746, "rewards/rejected": -1.6942154169082642, "step": 418 }, { "epoch": 0.6152716593245228, "epsilon_dpo/beta": 0.003663749899715185, "epsilon_dpo/beta_margin_grad_mean": -0.25385352969169617, "epsilon_dpo/beta_margin_grad_std": 0.13596518337726593, "epsilon_dpo/beta_margin_mean": 1.1844698190689087, "epsilon_dpo/beta_margin_std": 0.7216576337814331, "epsilon_dpo/loss_margin_mean": 323.8036804199219, "grad_norm": 44.2584228515625, "kl/avg_steps": 0.90625, "kl/beta": 0.0036968865897506475, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -2.3918709754943848, "logits/rejected": -2.2970142364501953, "logps/chosen": -165.10519409179688, "logps/ref_chosen": -34.45082473754883, "logps/ref_rejected": -98.03851318359375, "logps/rejected": -552.49658203125, "loss": 0.6311, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4797900915145874, "rewards/margins": 1.1844696998596191, "rewards/rejected": -1.664259910583496, "step": 419 }, { "epoch": 0.6167400881057269, "epsilon_dpo/beta": 0.0036365704145282507, "epsilon_dpo/beta_margin_grad_mean": -0.27526789903640747, "epsilon_dpo/beta_margin_grad_std": 0.14264513552188873, "epsilon_dpo/beta_margin_mean": 1.0957496166229248, "epsilon_dpo/beta_margin_std": 0.781441330909729, "epsilon_dpo/loss_margin_mean": 302.1807556152344, "grad_norm": 52.36576843261719, "kl/avg_steps": 0.75, "kl/beta": 0.0036636844743043184, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -2.424182891845703, "logits/rejected": -2.128852605819702, "logps/chosen": -174.64231872558594, "logps/ref_chosen": -37.82621765136719, "logps/ref_rejected": -76.69117736816406, "logps/rejected": -515.6880493164062, "loss": 0.6883, "rewards/accuracies": 0.921875, "rewards/chosen": -0.499639630317688, "rewards/margins": 1.0957496166229248, "rewards/rejected": -1.5953891277313232, "step": 420 }, { "epoch": 0.618208516886931, "epsilon_dpo/beta": 0.003604953410103917, "epsilon_dpo/beta_margin_grad_mean": -0.27535223960876465, "epsilon_dpo/beta_margin_grad_std": 0.13682620227336884, "epsilon_dpo/beta_margin_mean": 1.0881061553955078, "epsilon_dpo/beta_margin_std": 0.751617968082428, "epsilon_dpo/loss_margin_mean": 302.30877685546875, "grad_norm": 42.273597717285156, "kl/avg_steps": 0.875, "kl/beta": 0.003636411391198635, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -2.356675863265991, "logits/rejected": -2.3096842765808105, "logps/chosen": -170.40980529785156, "logps/ref_chosen": -33.550575256347656, "logps/ref_rejected": -93.99878692626953, "logps/rejected": -533.166748046875, "loss": 0.6841, "rewards/accuracies": 0.953125, "rewards/chosen": -0.49425339698791504, "rewards/margins": 1.0881061553955078, "rewards/rejected": -1.5823595523834229, "step": 421 }, { "epoch": 0.6196769456681351, "epsilon_dpo/beta": 0.0035804433282464743, "epsilon_dpo/beta_margin_grad_mean": -0.29801541566848755, "epsilon_dpo/beta_margin_grad_std": 0.17275013029575348, "epsilon_dpo/beta_margin_mean": 0.9903641939163208, "epsilon_dpo/beta_margin_std": 0.89506596326828, "epsilon_dpo/loss_margin_mean": 277.8594970703125, "grad_norm": 46.02511215209961, "kl/avg_steps": 0.6875, "kl/beta": 0.0036048688925802708, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -2.373016357421875, "logits/rejected": -2.3339807987213135, "logps/chosen": -231.19760131835938, "logps/ref_chosen": -53.665977478027344, "logps/ref_rejected": -91.93289947509766, "logps/rejected": -547.3240356445312, "loss": 0.7896, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6386697292327881, "rewards/margins": 0.9903641939163208, "rewards/rejected": -1.6290339231491089, "step": 422 }, { "epoch": 0.6211453744493393, "epsilon_dpo/beta": 0.003549282206222415, "epsilon_dpo/beta_margin_grad_mean": -0.2701137661933899, "epsilon_dpo/beta_margin_grad_std": 0.13685345649719238, "epsilon_dpo/beta_margin_mean": 1.1097465753555298, "epsilon_dpo/beta_margin_std": 0.7358165383338928, "epsilon_dpo/loss_margin_mean": 313.14617919921875, "grad_norm": 43.717838287353516, "kl/avg_steps": 0.875, "kl/beta": 0.003580254502594471, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.902669377503756e-07, "logits/chosen": -2.299365758895874, "logits/rejected": -2.2865123748779297, "logps/chosen": -190.4375762939453, "logps/ref_chosen": -35.00245666503906, "logps/ref_rejected": -91.82589721679688, "logps/rejected": -560.4072265625, "loss": 0.6702, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5520890951156616, "rewards/margins": 1.1097465753555298, "rewards/rejected": -1.6618356704711914, "step": 423 }, { "epoch": 0.6226138032305433, "epsilon_dpo/beta": 0.0035229322966188192, "epsilon_dpo/beta_margin_grad_mean": -0.30060890316963196, "epsilon_dpo/beta_margin_grad_std": 0.15238063037395477, "epsilon_dpo/beta_margin_mean": 0.9499651789665222, "epsilon_dpo/beta_margin_std": 0.7828956842422485, "epsilon_dpo/loss_margin_mean": 270.63690185546875, "grad_norm": 42.60936737060547, "kl/avg_steps": 0.75, "kl/beta": 0.0035491990856826305, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.890215699729057e-07, "logits/chosen": -2.3963804244995117, "logits/rejected": -2.1616666316986084, "logps/chosen": -187.34298706054688, "logps/ref_chosen": -33.510772705078125, "logps/ref_rejected": -70.15070343017578, "logps/rejected": -494.61981201171875, "loss": 0.7749, "rewards/accuracies": 0.875, "rewards/chosen": -0.543306291103363, "rewards/margins": 0.9499651789665222, "rewards/rejected": -1.4932714700698853, "step": 424 }, { "epoch": 0.6240822320117474, "epsilon_dpo/beta": 0.0034945050720125437, "epsilon_dpo/beta_margin_grad_mean": -0.27929848432540894, "epsilon_dpo/beta_margin_grad_std": 0.15515829622745514, "epsilon_dpo/beta_margin_mean": 1.0812467336654663, "epsilon_dpo/beta_margin_std": 0.8364912867546082, "epsilon_dpo/loss_margin_mean": 310.2441101074219, "grad_norm": 47.06675338745117, "kl/avg_steps": 0.8125, "kl/beta": 0.0035227781627327204, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -2.3238930702209473, "logits/rejected": -2.3823156356811523, "logps/chosen": -167.17559814453125, "logps/ref_chosen": -31.619510650634766, "logps/ref_rejected": -103.71711730957031, "logps/rejected": -549.517333984375, "loss": 0.7154, "rewards/accuracies": 0.90625, "rewards/chosen": -0.47508615255355835, "rewards/margins": 1.0812466144561768, "rewards/rejected": -1.5563328266143799, "step": 425 }, { "epoch": 0.6255506607929515, "epsilon_dpo/beta": 0.003468525130301714, "epsilon_dpo/beta_margin_grad_mean": -0.28968408703804016, "epsilon_dpo/beta_margin_grad_std": 0.1473844200372696, "epsilon_dpo/beta_margin_mean": 1.007444977760315, "epsilon_dpo/beta_margin_std": 0.7751356959342957, "epsilon_dpo/loss_margin_mean": 291.418701171875, "grad_norm": 66.66638946533203, "kl/avg_steps": 0.75, "kl/beta": 0.00349438632838428, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -2.42264986038208, "logits/rejected": -2.2819976806640625, "logps/chosen": -187.71844482421875, "logps/ref_chosen": -49.42609405517578, "logps/ref_rejected": -86.20869445800781, "logps/rejected": -515.9197387695312, "loss": 0.7381, "rewards/accuracies": 0.890625, "rewards/chosen": -0.48144960403442383, "rewards/margins": 1.007444977760315, "rewards/rejected": -1.4888947010040283, "step": 426 }, { "epoch": 0.6270190895741556, "epsilon_dpo/beta": 0.0034443254116922617, "epsilon_dpo/beta_margin_grad_mean": -0.3163089156150818, "epsilon_dpo/beta_margin_grad_std": 0.1538737267255783, "epsilon_dpo/beta_margin_mean": 0.8614251017570496, "epsilon_dpo/beta_margin_std": 0.760391354560852, "epsilon_dpo/loss_margin_mean": 251.1796112060547, "grad_norm": 60.45020294189453, "kl/avg_steps": 0.703125, "kl/beta": 0.0034683735575526953, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -2.4879813194274902, "logits/rejected": -2.38423490524292, "logps/chosen": -194.640869140625, "logps/ref_chosen": -40.53302001953125, "logps/ref_rejected": -84.44095611572266, "logps/rejected": -489.7284240722656, "loss": 0.8242, "rewards/accuracies": 0.875, "rewards/chosen": -0.5330853462219238, "rewards/margins": 0.8614251613616943, "rewards/rejected": -1.3945105075836182, "step": 427 }, { "epoch": 0.6284875183553598, "epsilon_dpo/beta": 0.0034143617376685143, "epsilon_dpo/beta_margin_grad_mean": -0.2803729176521301, "epsilon_dpo/beta_margin_grad_std": 0.14126239717006683, "epsilon_dpo/beta_margin_mean": 1.0690611600875854, "epsilon_dpo/beta_margin_std": 0.7979580163955688, "epsilon_dpo/loss_margin_mean": 313.6771545410156, "grad_norm": 48.36793899536133, "kl/avg_steps": 0.875, "kl/beta": 0.0034441568423062563, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -2.412050724029541, "logits/rejected": -2.416749954223633, "logps/chosen": -196.82138061523438, "logps/ref_chosen": -36.43036651611328, "logps/ref_rejected": -111.91488647460938, "logps/rejected": -585.9830322265625, "loss": 0.7038, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5489051938056946, "rewards/margins": 1.069061040878296, "rewards/rejected": -1.6179664134979248, "step": 428 }, { "epoch": 0.6299559471365639, "epsilon_dpo/beta": 0.0033868795726448298, "epsilon_dpo/beta_margin_grad_mean": -0.2734263241291046, "epsilon_dpo/beta_margin_grad_std": 0.13189740478992462, "epsilon_dpo/beta_margin_mean": 1.0902152061462402, "epsilon_dpo/beta_margin_std": 0.7191058397293091, "epsilon_dpo/loss_margin_mean": 322.5122375488281, "grad_norm": 45.01988220214844, "kl/avg_steps": 0.8125, "kl/beta": 0.003414281876757741, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.828194884925749e-07, "logits/chosen": -2.5120902061462402, "logits/rejected": -2.3635692596435547, "logps/chosen": -211.84674072265625, "logps/ref_chosen": -49.30812072753906, "logps/ref_rejected": -98.94145965576172, "logps/rejected": -583.9923095703125, "loss": 0.6748, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5519835352897644, "rewards/margins": 1.0902152061462402, "rewards/rejected": -1.6421988010406494, "step": 429 }, { "epoch": 0.631424375917768, "epsilon_dpo/beta": 0.0033638167660683393, "epsilon_dpo/beta_margin_grad_mean": -0.2967393100261688, "epsilon_dpo/beta_margin_grad_std": 0.14850302040576935, "epsilon_dpo/beta_margin_mean": 0.9720072150230408, "epsilon_dpo/beta_margin_std": 0.7672375440597534, "epsilon_dpo/loss_margin_mean": 290.09161376953125, "grad_norm": 43.82745361328125, "kl/avg_steps": 0.6875, "kl/beta": 0.003386764321476221, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -2.4356656074523926, "logits/rejected": -2.5032784938812256, "logps/chosen": -194.5563507080078, "logps/ref_chosen": -45.3841438293457, "logps/ref_rejected": -110.27545928955078, "logps/rejected": -549.539306640625, "loss": 0.7561, "rewards/accuracies": 0.875, "rewards/chosen": -0.5032367706298828, "rewards/margins": 0.9720072746276855, "rewards/rejected": -1.4752440452575684, "step": 430 }, { "epoch": 0.6328928046989721, "epsilon_dpo/beta": 0.00333664333447814, "epsilon_dpo/beta_margin_grad_mean": -0.2679252624511719, "epsilon_dpo/beta_margin_grad_std": 0.13725493848323822, "epsilon_dpo/beta_margin_mean": 1.1313966512680054, "epsilon_dpo/beta_margin_std": 0.7779242396354675, "epsilon_dpo/loss_margin_mean": 339.86541748046875, "grad_norm": 37.97023391723633, "kl/avg_steps": 0.8125, "kl/beta": 0.003363639349117875, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -2.4460361003875732, "logits/rejected": -2.4488284587860107, "logps/chosen": -159.66354370117188, "logps/ref_chosen": -34.30770492553711, "logps/ref_rejected": -98.43756866455078, "logps/rejected": -563.6588134765625, "loss": 0.6656, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4194945991039276, "rewards/margins": 1.1313966512680054, "rewards/rejected": -1.5508911609649658, "step": 431 }, { "epoch": 0.6343612334801763, "epsilon_dpo/beta": 0.0033087090123444796, "epsilon_dpo/beta_margin_grad_mean": -0.28203877806663513, "epsilon_dpo/beta_margin_grad_std": 0.12971222400665283, "epsilon_dpo/beta_margin_mean": 1.032950520515442, "epsilon_dpo/beta_margin_std": 0.699227511882782, "epsilon_dpo/loss_margin_mean": 312.8155822753906, "grad_norm": 44.22251892089844, "kl/avg_steps": 0.84375, "kl/beta": 0.003336530178785324, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.791192214186223e-07, "logits/chosen": -2.466712236404419, "logits/rejected": -2.4444732666015625, "logps/chosen": -184.24755859375, "logps/ref_chosen": -38.85405349731445, "logps/ref_rejected": -105.27049255371094, "logps/rejected": -563.4796142578125, "loss": 0.7004, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4821677803993225, "rewards/margins": 1.0329504013061523, "rewards/rejected": -1.5151182413101196, "step": 432 }, { "epoch": 0.6358296622613803, "epsilon_dpo/beta": 0.0032830932177603245, "epsilon_dpo/beta_margin_grad_mean": -0.2790597379207611, "epsilon_dpo/beta_margin_grad_std": 0.14551861584186554, "epsilon_dpo/beta_margin_mean": 1.0783592462539673, "epsilon_dpo/beta_margin_std": 0.7789259552955627, "epsilon_dpo/loss_margin_mean": 329.2322692871094, "grad_norm": 45.43602752685547, "kl/avg_steps": 0.78125, "kl/beta": 0.0033086135517805815, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -2.3216044902801514, "logits/rejected": -2.3884963989257812, "logps/chosen": -213.7806396484375, "logps/ref_chosen": -35.81452178955078, "logps/ref_rejected": -103.11997985839844, "logps/rejected": -610.318359375, "loss": 0.6991, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5854209661483765, "rewards/margins": 1.0783591270446777, "rewards/rejected": -1.6637802124023438, "step": 433 }, { "epoch": 0.6372980910425844, "epsilon_dpo/beta": 0.00325866905041039, "epsilon_dpo/beta_margin_grad_mean": -0.31264179944992065, "epsilon_dpo/beta_margin_grad_std": 0.1338089406490326, "epsilon_dpo/beta_margin_mean": 0.8834754228591919, "epsilon_dpo/beta_margin_std": 0.7322390079498291, "epsilon_dpo/loss_margin_mean": 271.9414367675781, "grad_norm": 43.43961715698242, "kl/avg_steps": 0.75, "kl/beta": 0.0032829653937369585, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -2.4602980613708496, "logits/rejected": -2.3824265003204346, "logps/chosen": -245.33560180664062, "logps/ref_chosen": -49.93016052246094, "logps/ref_rejected": -83.06277465820312, "logps/rejected": -550.40966796875, "loss": 0.7919, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6385765075683594, "rewards/margins": 0.8834754228591919, "rewards/rejected": -1.5220519304275513, "step": 434 }, { "epoch": 0.6387665198237885, "epsilon_dpo/beta": 0.003229319117963314, "epsilon_dpo/beta_margin_grad_mean": -0.2789570689201355, "epsilon_dpo/beta_margin_grad_std": 0.13298888504505157, "epsilon_dpo/beta_margin_mean": 1.0672143697738647, "epsilon_dpo/beta_margin_std": 0.7533097863197327, "epsilon_dpo/loss_margin_mean": 330.9088439941406, "grad_norm": 42.341400146484375, "kl/avg_steps": 0.90625, "kl/beta": 0.0032585265580564737, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -2.4340808391571045, "logits/rejected": -2.4357552528381348, "logps/chosen": -165.0016326904297, "logps/ref_chosen": -33.67361831665039, "logps/ref_rejected": -93.73330688476562, "logps/rejected": -555.97021484375, "loss": 0.6928, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4249953627586365, "rewards/margins": 1.0672142505645752, "rewards/rejected": -1.4922096729278564, "step": 435 }, { "epoch": 0.6402349486049926, "epsilon_dpo/beta": 0.0032013256568461657, "epsilon_dpo/beta_margin_grad_mean": -0.2841736078262329, "epsilon_dpo/beta_margin_grad_std": 0.11989546567201614, "epsilon_dpo/beta_margin_mean": 1.0001612901687622, "epsilon_dpo/beta_margin_std": 0.6187871694564819, "epsilon_dpo/loss_margin_mean": 312.92425537109375, "grad_norm": 51.16184616088867, "kl/avg_steps": 0.875, "kl/beta": 0.003229261375963688, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.742118314717391e-07, "logits/chosen": -2.5231666564941406, "logits/rejected": -2.4640743732452393, "logps/chosen": -188.12747192382812, "logps/ref_chosen": -42.048667907714844, "logps/ref_rejected": -88.41279602050781, "logps/rejected": -547.4158325195312, "loss": 0.7008, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4683375954627991, "rewards/margins": 1.0001612901687622, "rewards/rejected": -1.468498945236206, "step": 436 }, { "epoch": 0.6417033773861968, "epsilon_dpo/beta": 0.0031775590032339096, "epsilon_dpo/beta_margin_grad_mean": -0.30295178294181824, "epsilon_dpo/beta_margin_grad_std": 0.14245736598968506, "epsilon_dpo/beta_margin_mean": 0.9152756929397583, "epsilon_dpo/beta_margin_std": 0.6979488134384155, "epsilon_dpo/loss_margin_mean": 289.0637512207031, "grad_norm": 45.89207458496094, "kl/avg_steps": 0.75, "kl/beta": 0.0032012504525482655, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -2.4697906970977783, "logits/rejected": -2.390430450439453, "logps/chosen": -226.48953247070312, "logps/ref_chosen": -44.77692413330078, "logps/ref_rejected": -86.48928833007812, "logps/rejected": -557.265625, "loss": 0.7736, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5791641473770142, "rewards/margins": 0.9152756929397583, "rewards/rejected": -1.4944398403167725, "step": 437 }, { "epoch": 0.6431718061674009, "epsilon_dpo/beta": 0.0031519182957708836, "epsilon_dpo/beta_margin_grad_mean": -0.2808303236961365, "epsilon_dpo/beta_margin_grad_std": 0.12779134511947632, "epsilon_dpo/beta_margin_mean": 1.035179853439331, "epsilon_dpo/beta_margin_std": 0.6745886206626892, "epsilon_dpo/loss_margin_mean": 329.1087341308594, "grad_norm": 40.396175384521484, "kl/avg_steps": 0.8125, "kl/beta": 0.0031774197705090046, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -2.473397731781006, "logits/rejected": -2.3974690437316895, "logps/chosen": -164.937744140625, "logps/ref_chosen": -33.662109375, "logps/ref_rejected": -89.52166748046875, "logps/rejected": -549.906005859375, "loss": 0.6943, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41470038890838623, "rewards/margins": 1.035179853439331, "rewards/rejected": -1.4498802423477173, "step": 438 }, { "epoch": 0.644640234948605, "epsilon_dpo/beta": 0.003126515308395028, "epsilon_dpo/beta_margin_grad_mean": -0.3024454414844513, "epsilon_dpo/beta_margin_grad_std": 0.1240093857049942, "epsilon_dpo/beta_margin_mean": 0.91387939453125, "epsilon_dpo/beta_margin_std": 0.6512243747711182, "epsilon_dpo/loss_margin_mean": 292.9576721191406, "grad_norm": 45.90419006347656, "kl/avg_steps": 0.8125, "kl/beta": 0.0031518111936748028, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -2.472698926925659, "logits/rejected": -2.5124192237854004, "logps/chosen": -196.35198974609375, "logps/ref_chosen": -34.986392974853516, "logps/ref_rejected": -87.497314453125, "logps/rejected": -541.820556640625, "loss": 0.7563, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5056626200675964, "rewards/margins": 0.91387939453125, "rewards/rejected": -1.4195419549942017, "step": 439 }, { "epoch": 0.6461086637298091, "epsilon_dpo/beta": 0.003103271359577775, "epsilon_dpo/beta_margin_grad_mean": -0.3023149073123932, "epsilon_dpo/beta_margin_grad_std": 0.134578138589859, "epsilon_dpo/beta_margin_mean": 0.9336312413215637, "epsilon_dpo/beta_margin_std": 0.7152769565582275, "epsilon_dpo/loss_margin_mean": 301.76104736328125, "grad_norm": 51.92120361328125, "kl/avg_steps": 0.75, "kl/beta": 0.003126409137621522, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -2.5072128772735596, "logits/rejected": -2.5406949520111084, "logps/chosen": -238.4813232421875, "logps/ref_chosen": -53.56586837768555, "logps/ref_rejected": -104.3643569946289, "logps/rejected": -591.0408935546875, "loss": 0.761, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5756343603134155, "rewards/margins": 0.9336312413215637, "rewards/rejected": -1.509265661239624, "step": 440 }, { "epoch": 0.6475770925110133, "epsilon_dpo/beta": 0.0030753209721297026, "epsilon_dpo/beta_margin_grad_mean": -0.2959996163845062, "epsilon_dpo/beta_margin_grad_std": 0.12961319088935852, "epsilon_dpo/beta_margin_mean": 0.9479500651359558, "epsilon_dpo/beta_margin_std": 0.6735605597496033, "epsilon_dpo/loss_margin_mean": 308.67352294921875, "grad_norm": 40.502708435058594, "kl/avg_steps": 0.90625, "kl/beta": 0.003103135619312525, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.681227682404166e-07, "logits/chosen": -2.5055270195007324, "logits/rejected": -2.469489812850952, "logps/chosen": -200.94158935546875, "logps/ref_chosen": -39.209449768066406, "logps/ref_rejected": -102.78851318359375, "logps/rejected": -573.1942138671875, "loss": 0.7438, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49797898530960083, "rewards/margins": 0.947950005531311, "rewards/rejected": -1.4459290504455566, "step": 441 }, { "epoch": 0.6490455212922174, "epsilon_dpo/beta": 0.003050584578886628, "epsilon_dpo/beta_margin_grad_mean": -0.2755725681781769, "epsilon_dpo/beta_margin_grad_std": 0.11769750714302063, "epsilon_dpo/beta_margin_mean": 1.046021819114685, "epsilon_dpo/beta_margin_std": 0.6180484294891357, "epsilon_dpo/loss_margin_mean": 343.6279602050781, "grad_norm": 37.710105895996094, "kl/avg_steps": 0.8125, "kl/beta": 0.0030752660240978003, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.669113001300851e-07, "logits/chosen": -2.491802930831909, "logits/rejected": -2.457737922668457, "logps/chosen": -159.370849609375, "logps/ref_chosen": -29.0069580078125, "logps/ref_rejected": -83.71453857421875, "logps/rejected": -557.7063598632812, "loss": 0.675, "rewards/accuracies": 0.921875, "rewards/chosen": -0.39891505241394043, "rewards/margins": 1.046021819114685, "rewards/rejected": -1.444936990737915, "step": 442 }, { "epoch": 0.6505139500734214, "epsilon_dpo/beta": 0.0030279052443802357, "epsilon_dpo/beta_margin_grad_mean": -0.3231920897960663, "epsilon_dpo/beta_margin_grad_std": 0.15162836015224457, "epsilon_dpo/beta_margin_mean": 0.8422841429710388, "epsilon_dpo/beta_margin_std": 0.7962964177131653, "epsilon_dpo/loss_margin_mean": 279.1518249511719, "grad_norm": 46.755645751953125, "kl/avg_steps": 0.75, "kl/beta": 0.003050480969250202, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -2.5168378353118896, "logits/rejected": -2.499556541442871, "logps/chosen": -233.57965087890625, "logps/ref_chosen": -52.68767166137695, "logps/ref_rejected": -93.39274597167969, "logps/rejected": -553.4365234375, "loss": 0.841, "rewards/accuracies": 0.890625, "rewards/chosen": -0.549048900604248, "rewards/margins": 0.8422842025756836, "rewards/rejected": -1.3913331031799316, "step": 443 }, { "epoch": 0.6519823788546255, "epsilon_dpo/beta": 0.0029996871016919613, "epsilon_dpo/beta_margin_grad_mean": -0.26602280139923096, "epsilon_dpo/beta_margin_grad_std": 0.12350434064865112, "epsilon_dpo/beta_margin_mean": 1.1127806901931763, "epsilon_dpo/beta_margin_std": 0.6654544472694397, "epsilon_dpo/loss_margin_mean": 371.2488708496094, "grad_norm": 38.41836929321289, "kl/avg_steps": 0.9375, "kl/beta": 0.003027772530913353, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -2.462678909301758, "logits/rejected": -2.5427470207214355, "logps/chosen": -178.10812377929688, "logps/ref_chosen": -35.20741271972656, "logps/ref_rejected": -104.47367858886719, "logps/rejected": -618.6232299804688, "loss": 0.6502, "rewards/accuracies": 0.984375, "rewards/chosen": -0.4290057420730591, "rewards/margins": 1.1127806901931763, "rewards/rejected": -1.5417864322662354, "step": 444 }, { "epoch": 0.6534508076358296, "epsilon_dpo/beta": 0.0029783889185637236, "epsilon_dpo/beta_margin_grad_mean": -0.31596365571022034, "epsilon_dpo/beta_margin_grad_std": 0.1648440957069397, "epsilon_dpo/beta_margin_mean": 0.8814136385917664, "epsilon_dpo/beta_margin_std": 0.8328824639320374, "epsilon_dpo/loss_margin_mean": 297.23583984375, "grad_norm": 48.016639709472656, "kl/avg_steps": 0.71875, "kl/beta": 0.002999651012942195, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -2.5741279125213623, "logits/rejected": -2.5458946228027344, "logps/chosen": -216.25303649902344, "logps/ref_chosen": -46.78347396850586, "logps/ref_rejected": -99.2047119140625, "logps/rejected": -565.91015625, "loss": 0.8334, "rewards/accuracies": 0.875, "rewards/chosen": -0.5069864392280579, "rewards/margins": 0.8814135789871216, "rewards/rejected": -1.3884000778198242, "step": 445 }, { "epoch": 0.6549192364170338, "epsilon_dpo/beta": 0.0029506187420338392, "epsilon_dpo/beta_margin_grad_mean": -0.2692401707172394, "epsilon_dpo/beta_margin_grad_std": 0.11489293724298477, "epsilon_dpo/beta_margin_mean": 1.0957218408584595, "epsilon_dpo/beta_margin_std": 0.6562060713768005, "epsilon_dpo/loss_margin_mean": 371.6014099121094, "grad_norm": 50.814979553222656, "kl/avg_steps": 0.9375, "kl/beta": 0.002978244796395302, "kl/n_epsilon_steps": 0.03125, "kl/p_epsilon_steps": 0.96875, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -2.528761863708496, "logits/rejected": -2.6162219047546387, "logps/chosen": -160.44252014160156, "logps/ref_chosen": -34.56015396118164, "logps/ref_rejected": -109.97004699707031, "logps/rejected": -607.453857421875, "loss": 0.6534, "rewards/accuracies": 0.96875, "rewards/chosen": -0.37178486585617065, "rewards/margins": 1.0957218408584595, "rewards/rejected": -1.4675066471099854, "step": 446 }, { "epoch": 0.6563876651982379, "epsilon_dpo/beta": 0.0029278243891894817, "epsilon_dpo/beta_margin_grad_mean": -0.2999436855316162, "epsilon_dpo/beta_margin_grad_std": 0.13803143799304962, "epsilon_dpo/beta_margin_mean": 0.9522778391838074, "epsilon_dpo/beta_margin_std": 0.7474956512451172, "epsilon_dpo/loss_margin_mean": 326.1642150878906, "grad_norm": 40.274986267089844, "kl/avg_steps": 0.78125, "kl/beta": 0.00295058311894536, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.608874379754465e-07, "logits/chosen": -2.502809762954712, "logits/rejected": -2.5910682678222656, "logps/chosen": -199.97299194335938, "logps/ref_chosen": -39.49730682373047, "logps/ref_rejected": -105.97085571289062, "logps/rejected": -592.6107788085938, "loss": 0.7578, "rewards/accuracies": 0.890625, "rewards/chosen": -0.471713662147522, "rewards/margins": 0.9522777795791626, "rewards/rejected": -1.4239914417266846, "step": 447 }, { "epoch": 0.657856093979442, "epsilon_dpo/beta": 0.0029014681931585073, "epsilon_dpo/beta_margin_grad_mean": -0.2846612334251404, "epsilon_dpo/beta_margin_grad_std": 0.12346479296684265, "epsilon_dpo/beta_margin_mean": 1.0155407190322876, "epsilon_dpo/beta_margin_std": 0.674639880657196, "epsilon_dpo/loss_margin_mean": 350.43487548828125, "grad_norm": 41.37987518310547, "kl/avg_steps": 0.90625, "kl/beta": 0.0029277103021740913, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -2.5615386962890625, "logits/rejected": -2.621204376220703, "logps/chosen": -227.657958984375, "logps/ref_chosen": -42.827239990234375, "logps/ref_rejected": -109.36424255371094, "logps/rejected": -644.6298217773438, "loss": 0.7027, "rewards/accuracies": 0.953125, "rewards/chosen": -0.5373802781105042, "rewards/margins": 1.0155407190322876, "rewards/rejected": -1.5529210567474365, "step": 448 }, { "epoch": 0.6593245227606461, "epsilon_dpo/beta": 0.002878130180761218, "epsilon_dpo/beta_margin_grad_mean": -0.28992605209350586, "epsilon_dpo/beta_margin_grad_std": 0.13492745161056519, "epsilon_dpo/beta_margin_mean": 0.9840430617332458, "epsilon_dpo/beta_margin_std": 0.6940675973892212, "epsilon_dpo/loss_margin_mean": 342.7997131347656, "grad_norm": 43.268863677978516, "kl/avg_steps": 0.8125, "kl/beta": 0.0029014162719249725, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.584941086944423e-07, "logits/chosen": -2.5850870609283447, "logits/rejected": -2.595974922180176, "logps/chosen": -194.70394897460938, "logps/ref_chosen": -36.90496826171875, "logps/ref_rejected": -95.95344543457031, "logps/rejected": -596.5521240234375, "loss": 0.7298, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4556068181991577, "rewards/margins": 0.9840430021286011, "rewards/rejected": -1.4396498203277588, "step": 449 }, { "epoch": 0.6607929515418502, "epsilon_dpo/beta": 0.002853134647011757, "epsilon_dpo/beta_margin_grad_mean": -0.2585708200931549, "epsilon_dpo/beta_margin_grad_std": 0.11334740370512009, "epsilon_dpo/beta_margin_mean": 1.1357218027114868, "epsilon_dpo/beta_margin_std": 0.607581377029419, "epsilon_dpo/loss_margin_mean": 398.61895751953125, "grad_norm": 37.674678802490234, "kl/avg_steps": 0.875, "kl/beta": 0.00287803215906024, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.573010452010098e-07, "logits/chosen": -2.546546459197998, "logits/rejected": -2.698519229888916, "logps/chosen": -175.88995361328125, "logps/ref_chosen": -34.65415573120117, "logps/ref_rejected": -108.24179077148438, "logps/rejected": -648.0965576171875, "loss": 0.6252, "rewards/accuracies": 0.953125, "rewards/chosen": -0.40372031927108765, "rewards/margins": 1.1357218027114868, "rewards/rejected": -1.5394420623779297, "step": 450 }, { "epoch": 0.6622613803230544, "epsilon_dpo/beta": 0.002834628103300929, "epsilon_dpo/beta_margin_grad_mean": -0.32642942667007446, "epsilon_dpo/beta_margin_grad_std": 0.15271244943141937, "epsilon_dpo/beta_margin_mean": 0.8098753690719604, "epsilon_dpo/beta_margin_std": 0.763114869594574, "epsilon_dpo/loss_margin_mean": 287.087158203125, "grad_norm": 46.581146240234375, "kl/avg_steps": 0.65625, "kl/beta": 0.0028530678246170282, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -2.5236382484436035, "logits/rejected": -2.479279041290283, "logps/chosen": -234.43658447265625, "logps/ref_chosen": -42.703250885009766, "logps/ref_rejected": -79.43376159667969, "logps/rejected": -558.2542724609375, "loss": 0.8563, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5459603071212769, "rewards/margins": 0.8098753690719604, "rewards/rejected": -1.3558356761932373, "step": 451 }, { "epoch": 0.6637298091042585, "epsilon_dpo/beta": 0.0028108321130275726, "epsilon_dpo/beta_margin_grad_mean": -0.2964177429676056, "epsilon_dpo/beta_margin_grad_std": 0.12497884780168533, "epsilon_dpo/beta_margin_mean": 0.9474288821220398, "epsilon_dpo/beta_margin_std": 0.651831328868866, "epsilon_dpo/loss_margin_mean": 337.69549560546875, "grad_norm": 40.06062316894531, "kl/avg_steps": 0.84375, "kl/beta": 0.002834466751664877, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.549222776991186e-07, "logits/chosen": -2.5114731788635254, "logits/rejected": -2.680518627166748, "logps/chosen": -166.42276000976562, "logps/ref_chosen": -35.80718231201172, "logps/ref_rejected": -102.27734375, "logps/rejected": -570.5884399414062, "loss": 0.7379, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3679235577583313, "rewards/margins": 0.9474288821220398, "rewards/rejected": -1.315352439880371, "step": 452 }, { "epoch": 0.6651982378854625, "epsilon_dpo/beta": 0.002787314122542739, "epsilon_dpo/beta_margin_grad_mean": -0.2953357696533203, "epsilon_dpo/beta_margin_grad_std": 0.12554973363876343, "epsilon_dpo/beta_margin_mean": 0.9552209973335266, "epsilon_dpo/beta_margin_std": 0.6552383899688721, "epsilon_dpo/loss_margin_mean": 343.3067321777344, "grad_norm": 39.58503723144531, "kl/avg_steps": 0.84375, "kl/beta": 0.002810751087963581, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -2.605818748474121, "logits/rejected": -2.60537052154541, "logps/chosen": -195.30682373046875, "logps/ref_chosen": -41.37712860107422, "logps/ref_rejected": -87.86880493164062, "logps/rejected": -585.105224609375, "loss": 0.7343, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4305250644683838, "rewards/margins": 0.9552209973335266, "rewards/rejected": -1.3857460021972656, "step": 453 }, { "epoch": 0.6666666666666666, "epsilon_dpo/beta": 0.0027683484368026257, "epsilon_dpo/beta_margin_grad_mean": -0.29989561438560486, "epsilon_dpo/beta_margin_grad_std": 0.14955060184001923, "epsilon_dpo/beta_margin_mean": 0.9520212411880493, "epsilon_dpo/beta_margin_std": 0.7603475451469421, "epsilon_dpo/loss_margin_mean": 345.2506103515625, "grad_norm": 55.10441207885742, "kl/avg_steps": 0.6875, "kl/beta": 0.0027872337959706783, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -2.5729262828826904, "logits/rejected": -2.6592321395874023, "logps/chosen": -236.1055145263672, "logps/ref_chosen": -44.58696746826172, "logps/ref_rejected": -90.57184600830078, "logps/rejected": -627.3410034179688, "loss": 0.7676, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5324690341949463, "rewards/margins": 0.9520212411880493, "rewards/rejected": -1.4844902753829956, "step": 454 }, { "epoch": 0.6681350954478708, "epsilon_dpo/beta": 0.0027442548889666796, "epsilon_dpo/beta_margin_grad_mean": -0.26028749346733093, "epsilon_dpo/beta_margin_grad_std": 0.12892557680606842, "epsilon_dpo/beta_margin_mean": 1.1716121435165405, "epsilon_dpo/beta_margin_std": 0.7572227120399475, "epsilon_dpo/loss_margin_mean": 427.5795593261719, "grad_norm": 33.066959381103516, "kl/avg_steps": 0.875, "kl/beta": 0.002768202219158411, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -2.561234474182129, "logits/rejected": -2.784088611602783, "logps/chosen": -160.47991943359375, "logps/ref_chosen": -22.870223999023438, "logps/ref_rejected": -121.32386779785156, "logps/rejected": -686.5131225585938, "loss": 0.6379, "rewards/accuracies": 0.953125, "rewards/chosen": -0.37842369079589844, "rewards/margins": 1.171612024307251, "rewards/rejected": -1.5500357151031494, "step": 455 }, { "epoch": 0.6696035242290749, "epsilon_dpo/beta": 0.002723881509155035, "epsilon_dpo/beta_margin_grad_mean": -0.2826097905635834, "epsilon_dpo/beta_margin_grad_std": 0.13924254477024078, "epsilon_dpo/beta_margin_mean": 1.053339958190918, "epsilon_dpo/beta_margin_std": 0.7655569314956665, "epsilon_dpo/loss_margin_mean": 387.81005859375, "grad_norm": 35.979026794433594, "kl/avg_steps": 0.75, "kl/beta": 0.002744190627709031, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -2.559840202331543, "logits/rejected": -2.7625198364257812, "logps/chosen": -205.63754272460938, "logps/ref_chosen": -40.844276428222656, "logps/ref_rejected": -111.70032501220703, "logps/rejected": -664.3036499023438, "loss": 0.7064, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4502452611923218, "rewards/margins": 1.053339958190918, "rewards/rejected": -1.5035851001739502, "step": 456 }, { "epoch": 0.671071953010279, "epsilon_dpo/beta": 0.0027019022963941097, "epsilon_dpo/beta_margin_grad_mean": -0.3032781481742859, "epsilon_dpo/beta_margin_grad_std": 0.13411802053451538, "epsilon_dpo/beta_margin_mean": 0.900545597076416, "epsilon_dpo/beta_margin_std": 0.6524316668510437, "epsilon_dpo/loss_margin_mean": 334.1934509277344, "grad_norm": 48.26573944091797, "kl/avg_steps": 0.8125, "kl/beta": 0.002723762532696128, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -2.6022448539733887, "logits/rejected": -2.619676351547241, "logps/chosen": -215.42941284179688, "logps/ref_chosen": -38.554141998291016, "logps/ref_rejected": -90.09440612792969, "logps/rejected": -601.1631469726562, "loss": 0.7706, "rewards/accuracies": 0.921875, "rewards/chosen": -0.47887659072875977, "rewards/margins": 0.900545597076416, "rewards/rejected": -1.3794221878051758, "step": 457 }, { "epoch": 0.6725403817914831, "epsilon_dpo/beta": 0.002680970588698983, "epsilon_dpo/beta_margin_grad_mean": -0.29450592398643494, "epsilon_dpo/beta_margin_grad_std": 0.12352831661701202, "epsilon_dpo/beta_margin_mean": 0.951833963394165, "epsilon_dpo/beta_margin_std": 0.6352096199989319, "epsilon_dpo/loss_margin_mean": 355.9402770996094, "grad_norm": 49.126129150390625, "kl/avg_steps": 0.78125, "kl/beta": 0.0027018103282898664, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -2.625985622406006, "logits/rejected": -2.772733211517334, "logps/chosen": -206.733642578125, "logps/ref_chosen": -37.41191482543945, "logps/ref_rejected": -104.581298828125, "logps/rejected": -629.8433227539062, "loss": 0.732, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4552924931049347, "rewards/margins": 0.951833963394165, "rewards/rejected": -1.4071264266967773, "step": 458 }, { "epoch": 0.6740088105726872, "epsilon_dpo/beta": 0.002659349935129285, "epsilon_dpo/beta_margin_grad_mean": -0.31732696294784546, "epsilon_dpo/beta_margin_grad_std": 0.12530642747879028, "epsilon_dpo/beta_margin_mean": 0.8335932493209839, "epsilon_dpo/beta_margin_std": 0.6189974546432495, "epsilon_dpo/loss_margin_mean": 314.1438903808594, "grad_norm": 36.275413513183594, "kl/avg_steps": 0.8125, "kl/beta": 0.0026808660477399826, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.466771464027316e-07, "logits/chosen": -2.501951217651367, "logits/rejected": -2.667633533477783, "logps/chosen": -192.38290405273438, "logps/ref_chosen": -32.51487350463867, "logps/ref_rejected": -90.99087524414062, "logps/rejected": -565.0028076171875, "loss": 0.8004, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4260733127593994, "rewards/margins": 0.8335932493209839, "rewards/rejected": -1.2596666812896729, "step": 459 }, { "epoch": 0.6754772393538914, "epsilon_dpo/beta": 0.0026354235596954823, "epsilon_dpo/beta_margin_grad_mean": -0.2933025658130646, "epsilon_dpo/beta_margin_grad_std": 0.11362046003341675, "epsilon_dpo/beta_margin_mean": 0.9537962079048157, "epsilon_dpo/beta_margin_std": 0.6000516414642334, "epsilon_dpo/loss_margin_mean": 362.2628479003906, "grad_norm": 46.03992462158203, "kl/avg_steps": 0.90625, "kl/beta": 0.0026592593640089035, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -2.543520212173462, "logits/rejected": -2.7920000553131104, "logps/chosen": -206.57542419433594, "logps/ref_chosen": -31.02279281616211, "logps/ref_rejected": -110.9461669921875, "logps/rejected": -648.7615966796875, "loss": 0.7211, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4630896747112274, "rewards/margins": 0.9537962079048157, "rewards/rejected": -1.4168858528137207, "step": 460 }, { "epoch": 0.6769456681350955, "epsilon_dpo/beta": 0.002612577984109521, "epsilon_dpo/beta_margin_grad_mean": -0.31691277027130127, "epsilon_dpo/beta_margin_grad_std": 0.11491651087999344, "epsilon_dpo/beta_margin_mean": 0.8249220252037048, "epsilon_dpo/beta_margin_std": 0.5639069676399231, "epsilon_dpo/loss_margin_mean": 316.2416687011719, "grad_norm": 37.75797653198242, "kl/avg_steps": 0.875, "kl/beta": 0.0026353762950748205, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -2.542848587036133, "logits/rejected": -2.7294955253601074, "logps/chosen": -206.56666564941406, "logps/ref_chosen": -35.32524108886719, "logps/ref_rejected": -93.41868591308594, "logps/rejected": -580.9017333984375, "loss": 0.7926, "rewards/accuracies": 0.953125, "rewards/chosen": -0.44818025827407837, "rewards/margins": 0.8249219655990601, "rewards/rejected": -1.2731022834777832, "step": 461 }, { "epoch": 0.6784140969162996, "epsilon_dpo/beta": 0.0025939985644072294, "epsilon_dpo/beta_margin_grad_mean": -0.3237418830394745, "epsilon_dpo/beta_margin_grad_std": 0.13112208247184753, "epsilon_dpo/beta_margin_mean": 0.8129886984825134, "epsilon_dpo/beta_margin_std": 0.6604252457618713, "epsilon_dpo/loss_margin_mean": 314.4653015136719, "grad_norm": 40.553749084472656, "kl/avg_steps": 0.71875, "kl/beta": 0.00261251674965024, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -2.639768123626709, "logits/rejected": -2.776127338409424, "logps/chosen": -216.20462036132812, "logps/ref_chosen": -44.890872955322266, "logps/ref_rejected": -85.42142486572266, "logps/rejected": -571.200439453125, "loss": 0.8226, "rewards/accuracies": 0.859375, "rewards/chosen": -0.446227490901947, "rewards/margins": 0.8129886984825134, "rewards/rejected": -1.2592161893844604, "step": 462 }, { "epoch": 0.6798825256975036, "epsilon_dpo/beta": 0.0025706232991069555, "epsilon_dpo/beta_margin_grad_mean": -0.28741469979286194, "epsilon_dpo/beta_margin_grad_std": 0.12530295550823212, "epsilon_dpo/beta_margin_mean": 1.0049502849578857, "epsilon_dpo/beta_margin_std": 0.6909827589988708, "epsilon_dpo/loss_margin_mean": 391.4453125, "grad_norm": 36.294189453125, "kl/avg_steps": 0.90625, "kl/beta": 0.0025938733015209436, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -2.6299610137939453, "logits/rejected": -2.773591995239258, "logps/chosen": -152.20263671875, "logps/ref_chosen": -27.038963317871094, "logps/ref_rejected": -98.03726196289062, "logps/rejected": -614.646240234375, "loss": 0.712, "rewards/accuracies": 0.953125, "rewards/chosen": -0.32232174277305603, "rewards/margins": 1.0049502849578857, "rewards/rejected": -1.3272720575332642, "step": 463 }, { "epoch": 0.6813509544787077, "epsilon_dpo/beta": 0.00254994654096663, "epsilon_dpo/beta_margin_grad_mean": -0.2655731737613678, "epsilon_dpo/beta_margin_grad_std": 0.12443613260984421, "epsilon_dpo/beta_margin_mean": 1.1106587648391724, "epsilon_dpo/beta_margin_std": 0.6612549424171448, "epsilon_dpo/loss_margin_mean": 436.4868469238281, "grad_norm": 48.614654541015625, "kl/avg_steps": 0.8125, "kl/beta": 0.0025705774314701557, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -2.5940399169921875, "logits/rejected": -2.805851697921753, "logps/chosen": -189.71588134765625, "logps/ref_chosen": -33.55242919921875, "logps/ref_rejected": -109.08905029296875, "logps/rejected": -701.7393798828125, "loss": 0.6506, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3993784487247467, "rewards/margins": 1.1106586456298828, "rewards/rejected": -1.5100371837615967, "step": 464 }, { "epoch": 0.6828193832599119, "epsilon_dpo/beta": 0.0025285982992500067, "epsilon_dpo/beta_margin_grad_mean": -0.28670158982276917, "epsilon_dpo/beta_margin_grad_std": 0.11507753282785416, "epsilon_dpo/beta_margin_mean": 0.9909960627555847, "epsilon_dpo/beta_margin_std": 0.621567964553833, "epsilon_dpo/loss_margin_mean": 392.5635986328125, "grad_norm": 35.46124267578125, "kl/avg_steps": 0.84375, "kl/beta": 0.0025498599279671907, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -2.6725172996520996, "logits/rejected": -2.8897171020507812, "logps/chosen": -178.53143310546875, "logps/ref_chosen": -35.28005599975586, "logps/ref_rejected": -116.3499755859375, "logps/rejected": -652.1649780273438, "loss": 0.7035, "rewards/accuracies": 0.953125, "rewards/chosen": -0.36298391222953796, "rewards/margins": 0.9909961223602295, "rewards/rejected": -1.3539800643920898, "step": 465 }, { "epoch": 0.684287812041116, "epsilon_dpo/beta": 0.0025074416771531105, "epsilon_dpo/beta_margin_grad_mean": -0.3023064434528351, "epsilon_dpo/beta_margin_grad_std": 0.11443696171045303, "epsilon_dpo/beta_margin_mean": 0.9071800112724304, "epsilon_dpo/beta_margin_std": 0.6066918969154358, "epsilon_dpo/loss_margin_mean": 362.46087646484375, "grad_norm": 32.29966354370117, "kl/avg_steps": 0.84375, "kl/beta": 0.002528525423258543, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -2.5877268314361572, "logits/rejected": -2.7581279277801514, "logps/chosen": -175.187255859375, "logps/ref_chosen": -28.18646240234375, "logps/ref_rejected": -97.64432525634766, "logps/rejected": -607.10595703125, "loss": 0.749, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3691789507865906, "rewards/margins": 0.9071800112724304, "rewards/rejected": -1.2763590812683105, "step": 466 }, { "epoch": 0.6857562408223201, "epsilon_dpo/beta": 0.0024880296550691128, "epsilon_dpo/beta_margin_grad_mean": -0.2942372262477875, "epsilon_dpo/beta_margin_grad_std": 0.15898768603801727, "epsilon_dpo/beta_margin_mean": 0.9923102259635925, "epsilon_dpo/beta_margin_std": 0.829995334148407, "epsilon_dpo/loss_margin_mean": 400.21441650390625, "grad_norm": 47.59917449951172, "kl/avg_steps": 0.78125, "kl/beta": 0.002507369499653578, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -2.691765308380127, "logits/rejected": -2.9094629287719727, "logps/chosen": -223.679443359375, "logps/ref_chosen": -46.7025146484375, "logps/ref_rejected": -110.00337982177734, "logps/rejected": -687.1947021484375, "loss": 0.7652, "rewards/accuracies": 0.890625, "rewards/chosen": -0.44280189275741577, "rewards/margins": 0.9923102259635925, "rewards/rejected": -1.4351122379302979, "step": 467 }, { "epoch": 0.6872246696035242, "epsilon_dpo/beta": 0.0024679647758603096, "epsilon_dpo/beta_margin_grad_mean": -0.29025694727897644, "epsilon_dpo/beta_margin_grad_std": 0.14004936814308167, "epsilon_dpo/beta_margin_mean": 1.0126533508300781, "epsilon_dpo/beta_margin_std": 0.7630032896995544, "epsilon_dpo/loss_margin_mean": 411.1828918457031, "grad_norm": 40.286834716796875, "kl/avg_steps": 0.8125, "kl/beta": 0.002487932564690709, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.362737437810114e-07, "logits/chosen": -2.67857027053833, "logits/rejected": -2.87001895904541, "logps/chosen": -192.12745666503906, "logps/ref_chosen": -42.05735778808594, "logps/ref_rejected": -109.48826599121094, "logps/rejected": -670.7412109375, "loss": 0.728, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37121495604515076, "rewards/margins": 1.0126533508300781, "rewards/rejected": -1.3838683366775513, "step": 468 }, { "epoch": 0.6886930983847284, "epsilon_dpo/beta": 0.002445760415866971, "epsilon_dpo/beta_margin_grad_mean": -0.2844110131263733, "epsilon_dpo/beta_margin_grad_std": 0.1195027157664299, "epsilon_dpo/beta_margin_mean": 1.0001988410949707, "epsilon_dpo/beta_margin_std": 0.6260125637054443, "epsilon_dpo/loss_margin_mean": 409.49169921875, "grad_norm": 48.018531799316406, "kl/avg_steps": 0.90625, "kl/beta": 0.002467880956828594, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.351323902551631e-07, "logits/chosen": -2.6168570518493652, "logits/rejected": -2.8470990657806396, "logps/chosen": -206.51370239257812, "logps/ref_chosen": -36.20582580566406, "logps/ref_rejected": -111.1355972290039, "logps/rejected": -690.9351806640625, "loss": 0.7019, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4174906611442566, "rewards/margins": 1.0001988410949707, "rewards/rejected": -1.417689561843872, "step": 469 }, { "epoch": 0.6901615271659325, "epsilon_dpo/beta": 0.002425323473289609, "epsilon_dpo/beta_margin_grad_mean": -0.2954522371292114, "epsilon_dpo/beta_margin_grad_std": 0.12080518156290054, "epsilon_dpo/beta_margin_mean": 0.9496338963508606, "epsilon_dpo/beta_margin_std": 0.6434259414672852, "epsilon_dpo/loss_margin_mean": 392.281005859375, "grad_norm": 35.75682830810547, "kl/avg_steps": 0.84375, "kl/beta": 0.002445716643705964, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.339940635976592e-07, "logits/chosen": -2.6013760566711426, "logits/rejected": -2.87990665435791, "logps/chosen": -157.40719604492188, "logps/ref_chosen": -31.09160804748535, "logps/ref_rejected": -87.30916595458984, "logps/rejected": -605.90576171875, "loss": 0.7329, "rewards/accuracies": 0.953125, "rewards/chosen": -0.30687910318374634, "rewards/margins": 0.9496338367462158, "rewards/rejected": -1.2565128803253174, "step": 470 }, { "epoch": 0.6916299559471366, "epsilon_dpo/beta": 0.002405789215117693, "epsilon_dpo/beta_margin_grad_mean": -0.30035099387168884, "epsilon_dpo/beta_margin_grad_std": 0.11763791739940643, "epsilon_dpo/beta_margin_mean": 0.9167090654373169, "epsilon_dpo/beta_margin_std": 0.606320858001709, "epsilon_dpo/loss_margin_mean": 381.869140625, "grad_norm": 35.150997161865234, "kl/avg_steps": 0.8125, "kl/beta": 0.002425253624096513, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -2.7026596069335938, "logits/rejected": -2.8277225494384766, "logps/chosen": -211.00067138671875, "logps/ref_chosen": -44.132484436035156, "logps/ref_rejected": -89.70744323730469, "logps/rejected": -638.4447631835938, "loss": 0.745, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4026249945163727, "rewards/margins": 0.9167090654373169, "rewards/rejected": -1.3193340301513672, "step": 471 }, { "epoch": 0.6930983847283406, "epsilon_dpo/beta": 0.002386399544775486, "epsilon_dpo/beta_margin_grad_mean": -0.3045946955680847, "epsilon_dpo/beta_margin_grad_std": 0.14598874747753143, "epsilon_dpo/beta_margin_mean": 0.9562135934829712, "epsilon_dpo/beta_margin_std": 0.8416454792022705, "epsilon_dpo/loss_margin_mean": 401.76422119140625, "grad_norm": 36.8669548034668, "kl/avg_steps": 0.8125, "kl/beta": 0.002405707258731127, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.317266107909975e-07, "logits/chosen": -2.770143985748291, "logits/rejected": -2.981727361679077, "logps/chosen": -265.7175598144531, "logps/ref_chosen": -59.834197998046875, "logps/ref_rejected": -123.57960510253906, "logps/rejected": -731.2271728515625, "loss": 0.778, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4928061366081238, "rewards/margins": 0.9562135934829712, "rewards/rejected": -1.4490196704864502, "step": 472 }, { "epoch": 0.6945668135095447, "epsilon_dpo/beta": 0.0023671663366258144, "epsilon_dpo/beta_margin_grad_mean": -0.3334435224533081, "epsilon_dpo/beta_margin_grad_std": 0.130352184176445, "epsilon_dpo/beta_margin_mean": 0.7624538540840149, "epsilon_dpo/beta_margin_std": 0.6814138889312744, "epsilon_dpo/loss_margin_mean": 323.02276611328125, "grad_norm": 50.768165588378906, "kl/avg_steps": 0.8125, "kl/beta": 0.0023863185197114944, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -2.749878406524658, "logits/rejected": -2.8519415855407715, "logps/chosen": -238.464599609375, "logps/ref_chosen": -44.87860870361328, "logps/ref_rejected": -85.82889556884766, "logps/rejected": -602.4376220703125, "loss": 0.8578, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4595049023628235, "rewards/margins": 0.7624538540840149, "rewards/rejected": -1.2219587564468384, "step": 473 }, { "epoch": 0.6960352422907489, "epsilon_dpo/beta": 0.0023473482578992844, "epsilon_dpo/beta_margin_grad_mean": -0.3070417642593384, "epsilon_dpo/beta_margin_grad_std": 0.12120737135410309, "epsilon_dpo/beta_margin_mean": 0.889529287815094, "epsilon_dpo/beta_margin_std": 0.6321231126785278, "epsilon_dpo/loss_margin_mean": 379.68218994140625, "grad_norm": 44.010650634765625, "kl/avg_steps": 0.84375, "kl/beta": 0.00236708577722311, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -2.6113648414611816, "logits/rejected": -2.8044967651367188, "logps/chosen": -195.01571655273438, "logps/ref_chosen": -30.269367218017578, "logps/ref_rejected": -97.37470245361328, "logps/rejected": -641.80322265625, "loss": 0.7665, "rewards/accuracies": 0.921875, "rewards/chosen": -0.38724249601364136, "rewards/margins": 0.889529287815094, "rewards/rejected": -1.2767717838287354, "step": 474 }, { "epoch": 0.697503671071953, "epsilon_dpo/beta": 0.002328441943973303, "epsilon_dpo/beta_margin_grad_mean": -0.32659581303596497, "epsilon_dpo/beta_margin_grad_std": 0.1249208152294159, "epsilon_dpo/beta_margin_mean": 0.8016044497489929, "epsilon_dpo/beta_margin_std": 0.6676663160324097, "epsilon_dpo/loss_margin_mean": 345.1247863769531, "grad_norm": 35.23362350463867, "kl/avg_steps": 0.8125, "kl/beta": 0.0023472807370126247, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -2.689065933227539, "logits/rejected": -2.8997483253479004, "logps/chosen": -206.45596313476562, "logps/ref_chosen": -34.96168518066406, "logps/ref_rejected": -94.91036987304688, "logps/rejected": -611.5294189453125, "loss": 0.8274, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4007193148136139, "rewards/margins": 0.8016044497489929, "rewards/rejected": -1.2023236751556396, "step": 475 }, { "epoch": 0.6989720998531571, "epsilon_dpo/beta": 0.002310403622686863, "epsilon_dpo/beta_margin_grad_mean": -0.32414335012435913, "epsilon_dpo/beta_margin_grad_std": 0.12600372731685638, "epsilon_dpo/beta_margin_mean": 0.8043047189712524, "epsilon_dpo/beta_margin_std": 0.6316364407539368, "epsilon_dpo/loss_margin_mean": 349.0835876464844, "grad_norm": 40.693206787109375, "kl/avg_steps": 0.78125, "kl/beta": 0.0023283627815544605, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -2.668698310852051, "logits/rejected": -2.884255886077881, "logps/chosen": -192.21990966796875, "logps/ref_chosen": -28.34685516357422, "logps/ref_rejected": -78.29444885253906, "logps/rejected": -591.2510986328125, "loss": 0.8208, "rewards/accuracies": 0.90625, "rewards/chosen": -0.37969422340393066, "rewards/margins": 0.8043047189712524, "rewards/rejected": -1.183998942375183, "step": 476 }, { "epoch": 0.7004405286343612, "epsilon_dpo/beta": 0.0022924933582544327, "epsilon_dpo/beta_margin_grad_mean": -0.3193974196910858, "epsilon_dpo/beta_margin_grad_std": 0.11756820976734161, "epsilon_dpo/beta_margin_mean": 0.8161155581474304, "epsilon_dpo/beta_margin_std": 0.5828248858451843, "epsilon_dpo/loss_margin_mean": 356.9313049316406, "grad_norm": 37.30575942993164, "kl/avg_steps": 0.78125, "kl/beta": 0.002310313517227769, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -2.7471470832824707, "logits/rejected": -2.842437267303467, "logps/chosen": -217.6685028076172, "logps/ref_chosen": -42.40911865234375, "logps/ref_rejected": -82.68942260742188, "logps/rejected": -614.880126953125, "loss": 0.802, "rewards/accuracies": 0.921875, "rewards/chosen": -0.40344566106796265, "rewards/margins": 0.8161154985427856, "rewards/rejected": -1.219561219215393, "step": 477 }, { "epoch": 0.7019089574155654, "epsilon_dpo/beta": 0.0022747220937162638, "epsilon_dpo/beta_margin_grad_mean": -0.3177875578403473, "epsilon_dpo/beta_margin_grad_std": 0.11528382450342178, "epsilon_dpo/beta_margin_mean": 0.8234969973564148, "epsilon_dpo/beta_margin_std": 0.577464759349823, "epsilon_dpo/loss_margin_mean": 362.92669677734375, "grad_norm": 39.92375564575195, "kl/avg_steps": 0.78125, "kl/beta": 0.0022924039512872696, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -2.6334609985351562, "logits/rejected": -2.921363592147827, "logps/chosen": -207.5630340576172, "logps/ref_chosen": -28.737815856933594, "logps/ref_rejected": -90.47331237792969, "logps/rejected": -632.2252197265625, "loss": 0.7954, "rewards/accuracies": 0.90625, "rewards/chosen": -0.407898485660553, "rewards/margins": 0.8234970569610596, "rewards/rejected": -1.2313954830169678, "step": 478 }, { "epoch": 0.7033773861967695, "epsilon_dpo/beta": 0.002257088664919138, "epsilon_dpo/beta_margin_grad_mean": -0.3112712800502777, "epsilon_dpo/beta_margin_grad_std": 0.1403777301311493, "epsilon_dpo/beta_margin_mean": 0.9059340357780457, "epsilon_dpo/beta_margin_std": 0.7668508291244507, "epsilon_dpo/loss_margin_mean": 402.3663330078125, "grad_norm": 42.03719711303711, "kl/avg_steps": 0.78125, "kl/beta": 0.0022746333852410316, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -2.710920810699463, "logits/rejected": -3.0226893424987793, "logps/chosen": -194.02374267578125, "logps/ref_chosen": -29.93898582458496, "logps/ref_rejected": -104.23573303222656, "logps/rejected": -670.686767578125, "loss": 0.7896, "rewards/accuracies": 0.90625, "rewards/chosen": -0.37124890089035034, "rewards/margins": 0.9059340357780457, "rewards/rejected": -1.2771828174591064, "step": 479 }, { "epoch": 0.7048458149779736, "epsilon_dpo/beta": 0.0022395916748791933, "epsilon_dpo/beta_margin_grad_mean": -0.3270338773727417, "epsilon_dpo/beta_margin_grad_std": 0.12246443331241608, "epsilon_dpo/beta_margin_mean": 0.7808691263198853, "epsilon_dpo/beta_margin_std": 0.6041759848594666, "epsilon_dpo/loss_margin_mean": 349.7110595703125, "grad_norm": 41.93212127685547, "kl/avg_steps": 0.78125, "kl/beta": 0.002257000654935837, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.227838333989088e-07, "logits/chosen": -2.7833218574523926, "logits/rejected": -2.883632183074951, "logps/chosen": -257.56707763671875, "logps/ref_chosen": -43.97426223754883, "logps/ref_rejected": -87.41323852539062, "logps/rejected": -650.7171020507812, "loss": 0.8298, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4799705743789673, "rewards/margins": 0.7808691263198853, "rewards/rejected": -1.2608397006988525, "step": 480 }, { "epoch": 0.7063142437591777, "epsilon_dpo/beta": 0.0022222306579351425, "epsilon_dpo/beta_margin_grad_mean": -0.31972822546958923, "epsilon_dpo/beta_margin_grad_std": 0.13162846863269806, "epsilon_dpo/beta_margin_mean": 0.8373962044715881, "epsilon_dpo/beta_margin_std": 0.6917349100112915, "epsilon_dpo/loss_margin_mean": 377.9779968261719, "grad_norm": 38.608699798583984, "kl/avg_steps": 0.78125, "kl/beta": 0.0022395045962184668, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -2.710186004638672, "logits/rejected": -3.0134506225585938, "logps/chosen": -229.94204711914062, "logps/ref_chosen": -36.98882293701172, "logps/ref_rejected": -98.65377807617188, "logps/rejected": -669.5849609375, "loss": 0.8128, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4300834834575653, "rewards/margins": 0.8373961448669434, "rewards/rejected": -1.267479658126831, "step": 481 }, { "epoch": 0.7077826725403817, "epsilon_dpo/beta": 0.0022050042171031237, "epsilon_dpo/beta_margin_grad_mean": -0.34148189425468445, "epsilon_dpo/beta_margin_grad_std": 0.12353808432817459, "epsilon_dpo/beta_margin_mean": 0.7151001691818237, "epsilon_dpo/beta_margin_std": 0.6155868172645569, "epsilon_dpo/loss_margin_mean": 325.2435607910156, "grad_norm": 43.73499298095703, "kl/avg_steps": 0.78125, "kl/beta": 0.0022221440449357033, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -2.777127504348755, "logits/rejected": -3.0118846893310547, "logps/chosen": -252.25537109375, "logps/ref_chosen": -47.419219970703125, "logps/ref_rejected": -92.47096252441406, "logps/rejected": -622.5506591796875, "loss": 0.875, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4530524015426636, "rewards/margins": 0.7151001691818237, "rewards/rejected": -1.1681525707244873, "step": 482 }, { "epoch": 0.7092511013215859, "epsilon_dpo/beta": 0.0021872217766940594, "epsilon_dpo/beta_margin_grad_mean": -0.3099426329135895, "epsilon_dpo/beta_margin_grad_std": 0.1277041733264923, "epsilon_dpo/beta_margin_mean": 0.8772286176681519, "epsilon_dpo/beta_margin_std": 0.6511039733886719, "epsilon_dpo/loss_margin_mean": 402.0494689941406, "grad_norm": 37.85783386230469, "kl/avg_steps": 0.8125, "kl/beta": 0.0022049180697649717, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.194847979251979e-07, "logits/chosen": -2.7550508975982666, "logits/rejected": -3.005962371826172, "logps/chosen": -219.78411865234375, "logps/ref_chosen": -39.672393798828125, "logps/ref_rejected": -100.94681549072266, "logps/rejected": -683.1080322265625, "loss": 0.7801, "rewards/accuracies": 0.921875, "rewards/chosen": -0.39537718892097473, "rewards/margins": 0.8772286176681519, "rewards/rejected": -1.2726057767868042, "step": 483 }, { "epoch": 0.71071953010279, "epsilon_dpo/beta": 0.002174378838390112, "epsilon_dpo/beta_margin_grad_mean": -0.35192134976387024, "epsilon_dpo/beta_margin_grad_std": 0.1350928395986557, "epsilon_dpo/beta_margin_mean": 0.6707573533058167, "epsilon_dpo/beta_margin_std": 0.6563224792480469, "epsilon_dpo/loss_margin_mean": 310.1882019042969, "grad_norm": 40.76497268676758, "kl/avg_steps": 0.59375, "kl/beta": 0.002187147503718734, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -2.699538230895996, "logits/rejected": -2.9171078205108643, "logps/chosen": -237.5784149169922, "logps/ref_chosen": -43.43277359008789, "logps/ref_rejected": -89.96736145019531, "logps/rejected": -594.3012084960938, "loss": 0.9168, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4243118464946747, "rewards/margins": 0.6707573533058167, "rewards/rejected": -1.095069169998169, "step": 484 }, { "epoch": 0.7121879588839941, "epsilon_dpo/beta": 0.0021574674174189568, "epsilon_dpo/beta_margin_grad_mean": -0.3015494644641876, "epsilon_dpo/beta_margin_grad_std": 0.12973572313785553, "epsilon_dpo/beta_margin_mean": 0.9309406876564026, "epsilon_dpo/beta_margin_std": 0.6903952360153198, "epsilon_dpo/loss_margin_mean": 432.610107421875, "grad_norm": 32.71714782714844, "kl/avg_steps": 0.78125, "kl/beta": 0.0021742379758507013, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -2.6770548820495605, "logits/rejected": -2.999387264251709, "logps/chosen": -169.16915893554688, "logps/ref_chosen": -37.34454345703125, "logps/ref_rejected": -111.42447662353516, "logps/rejected": -675.8591918945312, "loss": 0.7557, "rewards/accuracies": 0.921875, "rewards/chosen": -0.28553080558776855, "rewards/margins": 0.9309406876564026, "rewards/rejected": -1.2164714336395264, "step": 485 }, { "epoch": 0.7136563876651982, "epsilon_dpo/beta": 0.002140068681910634, "epsilon_dpo/beta_margin_grad_mean": -0.3228636085987091, "epsilon_dpo/beta_margin_grad_std": 0.1375182420015335, "epsilon_dpo/beta_margin_mean": 0.8183300495147705, "epsilon_dpo/beta_margin_std": 0.6934942007064819, "epsilon_dpo/loss_margin_mean": 383.51116943359375, "grad_norm": 35.33386993408203, "kl/avg_steps": 0.8125, "kl/beta": 0.0021573833655565977, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -2.755924701690674, "logits/rejected": -3.058840274810791, "logps/chosen": -193.83453369140625, "logps/ref_chosen": -35.52677536010742, "logps/ref_rejected": -99.17495727539062, "logps/rejected": -640.993896484375, "loss": 0.8285, "rewards/accuracies": 0.90625, "rewards/chosen": -0.34016796946525574, "rewards/margins": 0.8183300495147705, "rewards/rejected": -1.1584980487823486, "step": 486 }, { "epoch": 0.7151248164464024, "epsilon_dpo/beta": 0.0021214832086116076, "epsilon_dpo/beta_margin_grad_mean": -0.30856984853744507, "epsilon_dpo/beta_margin_grad_std": 0.10273387283086777, "epsilon_dpo/beta_margin_mean": 0.8478483557701111, "epsilon_dpo/beta_margin_std": 0.48607146739959717, "epsilon_dpo/loss_margin_mean": 400.2981872558594, "grad_norm": 43.43318557739258, "kl/avg_steps": 0.875, "kl/beta": 0.002139996038749814, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -2.668581247329712, "logits/rejected": -3.001206636428833, "logps/chosen": -184.3863525390625, "logps/ref_chosen": -31.08715057373047, "logps/ref_rejected": -98.84352111816406, "logps/rejected": -652.44091796875, "loss": 0.763, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3262515366077423, "rewards/margins": 0.8478483557701111, "rewards/rejected": -1.1740999221801758, "step": 487 }, { "epoch": 0.7165932452276065, "epsilon_dpo/beta": 0.0021050700452178717, "epsilon_dpo/beta_margin_grad_mean": -0.33707907795906067, "epsilon_dpo/beta_margin_grad_std": 0.12150773406028748, "epsilon_dpo/beta_margin_mean": 0.7329168319702148, "epsilon_dpo/beta_margin_std": 0.5872458815574646, "epsilon_dpo/loss_margin_mean": 349.0928039550781, "grad_norm": 35.68146514892578, "kl/avg_steps": 0.78125, "kl/beta": 0.0021214333828538656, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -2.7139856815338135, "logits/rejected": -2.9660186767578125, "logps/chosen": -184.39353942871094, "logps/ref_chosen": -35.27953338623047, "logps/ref_rejected": -89.09225463867188, "logps/rejected": -587.299072265625, "loss": 0.8576, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3142842650413513, "rewards/margins": 0.7329168319702148, "rewards/rejected": -1.047201156616211, "step": 488 }, { "epoch": 0.7180616740088106, "epsilon_dpo/beta": 0.0020880938973277807, "epsilon_dpo/beta_margin_grad_mean": -0.34188124537467957, "epsilon_dpo/beta_margin_grad_std": 0.10775003582239151, "epsilon_dpo/beta_margin_mean": 0.6980612874031067, "epsilon_dpo/beta_margin_std": 0.5159537196159363, "epsilon_dpo/loss_margin_mean": 335.0322570800781, "grad_norm": 40.08867263793945, "kl/avg_steps": 0.8125, "kl/beta": 0.0021049880888313055, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -2.7437334060668945, "logits/rejected": -2.992414712905884, "logps/chosen": -199.35702514648438, "logps/ref_chosen": -37.423851013183594, "logps/ref_rejected": -87.10142517089844, "logps/rejected": -584.06689453125, "loss": 0.8644, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3389902710914612, "rewards/margins": 0.6980613470077515, "rewards/rejected": -1.0370516777038574, "step": 489 }, { "epoch": 0.7195301027900147, "epsilon_dpo/beta": 0.002076485427096486, "epsilon_dpo/beta_margin_grad_mean": -0.3476724922657013, "epsilon_dpo/beta_margin_grad_std": 0.1495998054742813, "epsilon_dpo/beta_margin_mean": 0.7162193655967712, "epsilon_dpo/beta_margin_std": 0.7605259418487549, "epsilon_dpo/loss_margin_mean": 346.98077392578125, "grad_norm": 42.104305267333984, "kl/avg_steps": 0.5625, "kl/beta": 0.0020880228839814663, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -2.7477211952209473, "logits/rejected": -3.100935935974121, "logps/chosen": -247.21926879882812, "logps/ref_chosen": -41.46424102783203, "logps/ref_rejected": -115.67326354980469, "logps/rejected": -668.4090576171875, "loss": 0.9131, "rewards/accuracies": 0.796875, "rewards/chosen": -0.42905905842781067, "rewards/margins": 0.7162193059921265, "rewards/rejected": -1.1452784538269043, "step": 490 }, { "epoch": 0.7209985315712188, "epsilon_dpo/beta": 0.002060000551864505, "epsilon_dpo/beta_margin_grad_mean": -0.3543255031108856, "epsilon_dpo/beta_margin_grad_std": 0.11612068116664886, "epsilon_dpo/beta_margin_mean": 0.6437206268310547, "epsilon_dpo/beta_margin_std": 0.547960102558136, "epsilon_dpo/loss_margin_mean": 313.3808288574219, "grad_norm": 40.64850997924805, "kl/avg_steps": 0.796875, "kl/beta": 0.002076343633234501, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -2.7214245796203613, "logits/rejected": -3.0119452476501465, "logps/chosen": -226.95785522460938, "logps/ref_chosen": -41.33907699584961, "logps/ref_rejected": -79.69932556152344, "logps/rejected": -578.698974609375, "loss": 0.9096, "rewards/accuracies": 0.90625, "rewards/chosen": -0.38356825709342957, "rewards/margins": 0.6437206268310547, "rewards/rejected": -1.0272889137268066, "step": 491 }, { "epoch": 0.7224669603524229, "epsilon_dpo/beta": 0.0020440397784113884, "epsilon_dpo/beta_margin_grad_mean": -0.360959529876709, "epsilon_dpo/beta_margin_grad_std": 0.11504260450601578, "epsilon_dpo/beta_margin_mean": 0.6149997711181641, "epsilon_dpo/beta_margin_std": 0.5465402603149414, "epsilon_dpo/loss_margin_mean": 301.7585754394531, "grad_norm": 34.33412551879883, "kl/avg_steps": 0.78125, "kl/beta": 0.0020599286071956158, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.097764975115576e-07, "logits/chosen": -2.7548234462738037, "logits/rejected": -2.9680824279785156, "logps/chosen": -195.3032989501953, "logps/ref_chosen": -31.90703582763672, "logps/ref_rejected": -79.67924499511719, "logps/rejected": -544.8341064453125, "loss": 0.929, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3345920443534851, "rewards/margins": 0.6149997711181641, "rewards/rejected": -0.9495918154716492, "step": 492 }, { "epoch": 0.723935389133627, "epsilon_dpo/beta": 0.0020301109179854393, "epsilon_dpo/beta_margin_grad_mean": -0.3570352792739868, "epsilon_dpo/beta_margin_grad_std": 0.11298612505197525, "epsilon_dpo/beta_margin_mean": 0.6241260170936584, "epsilon_dpo/beta_margin_std": 0.5125545859336853, "epsilon_dpo/loss_margin_mean": 308.62005615234375, "grad_norm": 38.02676773071289, "kl/avg_steps": 0.6875, "kl/beta": 0.0020439601503312588, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -2.806032657623291, "logits/rejected": -3.0264089107513428, "logps/chosen": -245.80068969726562, "logps/ref_chosen": -52.45185089111328, "logps/ref_rejected": -91.2623291015625, "logps/rejected": -593.231201171875, "loss": 0.9167, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3943876028060913, "rewards/margins": 0.6241260766983032, "rewards/rejected": -1.0185136795043945, "step": 493 }, { "epoch": 0.7254038179148311, "epsilon_dpo/beta": 0.0020137112587690353, "epsilon_dpo/beta_margin_grad_mean": -0.3272671103477478, "epsilon_dpo/beta_margin_grad_std": 0.12577760219573975, "epsilon_dpo/beta_margin_mean": 0.7729417085647583, "epsilon_dpo/beta_margin_std": 0.6120246052742004, "epsilon_dpo/loss_margin_mean": 384.9142150878906, "grad_norm": 42.489864349365234, "kl/avg_steps": 0.8125, "kl/beta": 0.0020300038158893585, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -2.750584125518799, "logits/rejected": -3.075985908508301, "logps/chosen": -170.007568359375, "logps/ref_chosen": -27.903043746948242, "logps/ref_rejected": -92.12089538574219, "logps/rejected": -619.1396484375, "loss": 0.8385, "rewards/accuracies": 0.90625, "rewards/chosen": -0.28744399547576904, "rewards/margins": 0.7729417085647583, "rewards/rejected": -1.0603857040405273, "step": 494 }, { "epoch": 0.7268722466960352, "epsilon_dpo/beta": 0.0019955940078943968, "epsilon_dpo/beta_margin_grad_mean": -0.3212648630142212, "epsilon_dpo/beta_margin_grad_std": 0.1038970947265625, "epsilon_dpo/beta_margin_mean": 0.7920843362808228, "epsilon_dpo/beta_margin_std": 0.502342939376831, "epsilon_dpo/loss_margin_mean": 397.37994384765625, "grad_norm": 40.98225021362305, "kl/avg_steps": 0.90625, "kl/beta": 0.0020136430393904448, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -2.7651305198669434, "logits/rejected": -3.138665199279785, "logps/chosen": -212.49075317382812, "logps/ref_chosen": -37.603515625, "logps/ref_rejected": -97.60113525390625, "logps/rejected": -669.8682861328125, "loss": 0.8, "rewards/accuracies": 0.953125, "rewards/chosen": -0.3492244780063629, "rewards/margins": 0.7920843362808228, "rewards/rejected": -1.1413087844848633, "step": 495 }, { "epoch": 0.7283406754772394, "epsilon_dpo/beta": 0.001978294923901558, "epsilon_dpo/beta_margin_grad_mean": -0.3371340036392212, "epsilon_dpo/beta_margin_grad_std": 0.09828732907772064, "epsilon_dpo/beta_margin_mean": 0.7093020081520081, "epsilon_dpo/beta_margin_std": 0.45665252208709717, "epsilon_dpo/loss_margin_mean": 359.1010437011719, "grad_norm": 36.258426666259766, "kl/avg_steps": 0.875, "kl/beta": 0.001995558151975274, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -2.80366587638855, "logits/rejected": -3.1190853118896484, "logps/chosen": -235.90985107421875, "logps/ref_chosen": -45.088035583496094, "logps/ref_rejected": -92.02516174316406, "logps/rejected": -641.947998046875, "loss": 0.8456, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3781554102897644, "rewards/margins": 0.7093019485473633, "rewards/rejected": -1.0874574184417725, "step": 496 }, { "epoch": 0.7298091042584435, "epsilon_dpo/beta": 0.001961135072633624, "epsilon_dpo/beta_margin_grad_mean": -0.3387848436832428, "epsilon_dpo/beta_margin_grad_std": 0.10064252465963364, "epsilon_dpo/beta_margin_mean": 0.7051173448562622, "epsilon_dpo/beta_margin_std": 0.4776591360569, "epsilon_dpo/loss_margin_mean": 360.11083984375, "grad_norm": 31.69774627685547, "kl/avg_steps": 0.875, "kl/beta": 0.001978248590603471, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -2.7518420219421387, "logits/rejected": -3.1248888969421387, "logps/chosen": -182.076416015625, "logps/ref_chosen": -30.02985382080078, "logps/ref_rejected": -81.73121643066406, "logps/rejected": -593.888671875, "loss": 0.8518, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2986891269683838, "rewards/margins": 0.7051173448562622, "rewards/rejected": -1.003806471824646, "step": 497 }, { "epoch": 0.7312775330396476, "epsilon_dpo/beta": 0.0019471884006634355, "epsilon_dpo/beta_margin_grad_mean": -0.35530975461006165, "epsilon_dpo/beta_margin_grad_std": 0.13022439181804657, "epsilon_dpo/beta_margin_mean": 0.6506726145744324, "epsilon_dpo/beta_margin_std": 0.6256114840507507, "epsilon_dpo/loss_margin_mean": 335.5245056152344, "grad_norm": 43.087093353271484, "kl/avg_steps": 0.71875, "kl/beta": 0.0019610889721661806, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -2.85072660446167, "logits/rejected": -3.1245601177215576, "logps/chosen": -252.92184448242188, "logps/ref_chosen": -47.59989929199219, "logps/ref_rejected": -89.41059875488281, "logps/rejected": -630.257080078125, "loss": 0.9234, "rewards/accuracies": 0.890625, "rewards/chosen": -0.40171539783477783, "rewards/margins": 0.6506726741790771, "rewards/rejected": -1.0523879528045654, "step": 498 }, { "epoch": 0.7327459618208517, "epsilon_dpo/beta": 0.0019314672099426389, "epsilon_dpo/beta_margin_grad_mean": -0.3319936990737915, "epsilon_dpo/beta_margin_grad_std": 0.13098543882369995, "epsilon_dpo/beta_margin_mean": 0.765389621257782, "epsilon_dpo/beta_margin_std": 0.637798011302948, "epsilon_dpo/loss_margin_mean": 397.3015441894531, "grad_norm": 35.18621826171875, "kl/avg_steps": 0.8125, "kl/beta": 0.0019470942206680775, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -2.809879779815674, "logits/rejected": -3.214648485183716, "logps/chosen": -222.212158203125, "logps/ref_chosen": -34.13922882080078, "logps/ref_rejected": -108.009521484375, "logps/rejected": -693.384033203125, "loss": 0.849, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3641180992126465, "rewards/margins": 0.765389621257782, "rewards/rejected": -1.1295077800750732, "step": 499 }, { "epoch": 0.7342143906020558, "epsilon_dpo/beta": 0.0019146933918818831, "epsilon_dpo/beta_margin_grad_mean": -0.3297097384929657, "epsilon_dpo/beta_margin_grad_std": 0.12436074763536453, "epsilon_dpo/beta_margin_mean": 0.7685431241989136, "epsilon_dpo/beta_margin_std": 0.6095584630966187, "epsilon_dpo/loss_margin_mean": 402.1811828613281, "grad_norm": 37.49474334716797, "kl/avg_steps": 0.875, "kl/beta": 0.0019314016681164503, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -2.810401201248169, "logits/rejected": -3.235410451889038, "logps/chosen": -230.16627502441406, "logps/ref_chosen": -36.92897033691406, "logps/ref_rejected": -100.48208618164062, "logps/rejected": -695.9005737304688, "loss": 0.8394, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37080904841423035, "rewards/margins": 0.7685431241989136, "rewards/rejected": -1.1393522024154663, "step": 500 }, { "epoch": 0.7342143906020558, "eval_epsilon_dpo/beta": 0.0019060522317886353, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4040687382221222, "eval_epsilon_dpo/beta_margin_grad_std": 0.1391373723745346, "eval_epsilon_dpo/beta_margin_mean": 0.4280838668346405, "eval_epsilon_dpo/beta_margin_std": 0.6343588829040527, "eval_epsilon_dpo/loss_margin_mean": 226.7985382080078, "eval_kl/n_epsilon_steps": 0.27011987566947937, "eval_kl/p_epsilon_steps": 0.7290239930152893, "eval_logits/chosen": -2.9733166694641113, "eval_logits/rejected": -3.179455518722534, "eval_logps/chosen": -384.0945129394531, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -634.682373046875, "eval_loss": 0.5502132177352905, "eval_rewards/accuracies": 0.7452911138534546, "eval_rewards/chosen": -0.6045443415641785, "eval_rewards/margins": 0.4280838668346405, "eval_rewards/rejected": -1.0326281785964966, "eval_runtime": 38.4009, "eval_samples_per_second": 60.91, "eval_steps_per_second": 1.927, "step": 500 }, { "epoch": 0.73568281938326, "epsilon_dpo/beta": 0.0018998802406713367, "epsilon_dpo/beta_margin_grad_mean": -0.3355919420719147, "epsilon_dpo/beta_margin_grad_std": 0.12509720027446747, "epsilon_dpo/beta_margin_mean": 0.7397939562797546, "epsilon_dpo/beta_margin_std": 0.6008874773979187, "epsilon_dpo/loss_margin_mean": 390.53387451171875, "grad_norm": 35.63896179199219, "kl/avg_steps": 0.78125, "kl/beta": 0.001914648455567658, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -2.834397077560425, "logits/rejected": -3.2369680404663086, "logps/chosen": -224.27859497070312, "logps/ref_chosen": -39.85574722290039, "logps/ref_rejected": -85.64679718017578, "logps/rejected": -660.603515625, "loss": 0.8571, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35178065299987793, "rewards/margins": 0.7397940158843994, "rewards/rejected": -1.0915746688842773, "step": 501 }, { "epoch": 0.737151248164464, "epsilon_dpo/beta": 0.0018857462564483285, "epsilon_dpo/beta_margin_grad_mean": -0.37188294529914856, "epsilon_dpo/beta_margin_grad_std": 0.12015336006879807, "epsilon_dpo/beta_margin_mean": 0.5563080906867981, "epsilon_dpo/beta_margin_std": 0.5363189578056335, "epsilon_dpo/loss_margin_mean": 296.284912109375, "grad_norm": 49.43293762207031, "kl/avg_steps": 0.75, "kl/beta": 0.0018998062005266547, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 9.934134090518592e-08, "logits/chosen": -2.929020404815674, "logits/rejected": -3.1855547428131104, "logps/chosen": -272.3970031738281, "logps/ref_chosen": -53.44129943847656, "logps/ref_rejected": -86.86759185791016, "logps/rejected": -602.1082153320312, "loss": 0.9721, "rewards/accuracies": 0.875, "rewards/chosen": -0.41469526290893555, "rewards/margins": 0.5563080906867981, "rewards/rejected": -0.9710033535957336, "step": 502 }, { "epoch": 0.7386196769456681, "epsilon_dpo/beta": 0.0018699404317885637, "epsilon_dpo/beta_margin_grad_mean": -0.34032920002937317, "epsilon_dpo/beta_margin_grad_std": 0.11132641136646271, "epsilon_dpo/beta_margin_mean": 0.7089167833328247, "epsilon_dpo/beta_margin_std": 0.5404109358787537, "epsilon_dpo/loss_margin_mean": 379.900390625, "grad_norm": 37.46735382080078, "kl/avg_steps": 0.84375, "kl/beta": 0.0018856637179851532, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 9.831921068732571e-08, "logits/chosen": -2.8521652221679688, "logits/rejected": -3.250486135482788, "logps/chosen": -186.5693817138672, "logps/ref_chosen": -34.42343521118164, "logps/ref_rejected": -88.69686889648438, "logps/rejected": -620.7432250976562, "loss": 0.862, "rewards/accuracies": 0.921875, "rewards/chosen": -0.28525102138519287, "rewards/margins": 0.7089167833328247, "rewards/rejected": -0.9941678047180176, "step": 503 }, { "epoch": 0.7400881057268722, "epsilon_dpo/beta": 0.0018554636044427752, "epsilon_dpo/beta_margin_grad_mean": -0.31814929842948914, "epsilon_dpo/beta_margin_grad_std": 0.12207160890102386, "epsilon_dpo/beta_margin_mean": 0.8290372490882874, "epsilon_dpo/beta_margin_std": 0.6291282176971436, "epsilon_dpo/loss_margin_mean": 448.0475769042969, "grad_norm": 43.66539001464844, "kl/avg_steps": 0.78125, "kl/beta": 0.0018698865314945579, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 9.730107739932805e-08, "logits/chosen": -2.828281879425049, "logits/rejected": -3.302220106124878, "logps/chosen": -209.00950622558594, "logps/ref_chosen": -46.35227966308594, "logps/ref_rejected": -111.0133056640625, "logps/rejected": -721.7181396484375, "loss": 0.8029, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3029179573059082, "rewards/margins": 0.8290372490882874, "rewards/rejected": -1.1319552659988403, "step": 504 }, { "epoch": 0.7415565345080763, "epsilon_dpo/beta": 0.0018439794657751918, "epsilon_dpo/beta_margin_grad_mean": -0.372370183467865, "epsilon_dpo/beta_margin_grad_std": 0.15154238045215607, "epsilon_dpo/beta_margin_mean": 0.5709735155105591, "epsilon_dpo/beta_margin_std": 0.7030720710754395, "epsilon_dpo/loss_margin_mean": 311.79742431640625, "grad_norm": 45.315521240234375, "kl/avg_steps": 0.625, "kl/beta": 0.0018553913105279207, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 9.628696786995188e-08, "logits/chosen": -2.8761417865753174, "logits/rejected": -3.2082276344299316, "logps/chosen": -259.9178771972656, "logps/ref_chosen": -47.89013671875, "logps/ref_rejected": -96.4730453491211, "logps/rejected": -620.2982177734375, "loss": 1.0064, "rewards/accuracies": 0.8125, "rewards/chosen": -0.39312461018562317, "rewards/margins": 0.5709735155105591, "rewards/rejected": -0.9640980958938599, "step": 505 }, { "epoch": 0.7430249632892805, "epsilon_dpo/beta": 0.0018290685256943107, "epsilon_dpo/beta_margin_grad_mean": -0.3460908532142639, "epsilon_dpo/beta_margin_grad_std": 0.11466103047132492, "epsilon_dpo/beta_margin_mean": 0.6695333123207092, "epsilon_dpo/beta_margin_std": 0.5264294743537903, "epsilon_dpo/loss_margin_mean": 367.1669616699219, "grad_norm": 39.87546920776367, "kl/avg_steps": 0.8125, "kl/beta": 0.0018438671249896288, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 9.527690882192635e-08, "logits/chosen": -2.8096556663513184, "logits/rejected": -3.270047187805176, "logps/chosen": -212.9695281982422, "logps/ref_chosen": -31.555944442749023, "logps/ref_rejected": -84.6425552368164, "logps/rejected": -633.22314453125, "loss": 0.8885, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3329414129257202, "rewards/margins": 0.669533371925354, "rewards/rejected": -1.0024747848510742, "step": 506 }, { "epoch": 0.7444933920704846, "epsilon_dpo/beta": 0.001816042116843164, "epsilon_dpo/beta_margin_grad_mean": -0.366396963596344, "epsilon_dpo/beta_margin_grad_std": 0.11917287856340408, "epsilon_dpo/beta_margin_mean": 0.5886867642402649, "epsilon_dpo/beta_margin_std": 0.5481820702552795, "epsilon_dpo/loss_margin_mean": 325.3662414550781, "grad_norm": 34.283470153808594, "kl/avg_steps": 0.71875, "kl/beta": 0.001829006476327777, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.427092687124691e-08, "logits/chosen": -2.9394993782043457, "logits/rejected": -3.2683815956115723, "logps/chosen": -282.3764953613281, "logps/ref_chosen": -50.91284942626953, "logps/ref_rejected": -101.67366790771484, "logps/rejected": -658.5035400390625, "loss": 0.9498, "rewards/accuracies": 0.875, "rewards/chosen": -0.42130210995674133, "rewards/margins": 0.5886867642402649, "rewards/rejected": -1.009988784790039, "step": 507 }, { "epoch": 0.7459618208516887, "epsilon_dpo/beta": 0.0018047851044684649, "epsilon_dpo/beta_margin_grad_mean": -0.3578824996948242, "epsilon_dpo/beta_margin_grad_std": 0.12713228166103363, "epsilon_dpo/beta_margin_mean": 0.6354185342788696, "epsilon_dpo/beta_margin_std": 0.6018913984298706, "epsilon_dpo/loss_margin_mean": 353.8747863769531, "grad_norm": 40.939327239990234, "kl/avg_steps": 0.625, "kl/beta": 0.0018159543396905065, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 9.326904852647344e-08, "logits/chosen": -2.883289337158203, "logits/rejected": -3.2906653881073, "logps/chosen": -260.6716613769531, "logps/ref_chosen": -50.73012924194336, "logps/ref_rejected": -101.8434829711914, "logps/rejected": -665.6597900390625, "loss": 0.9286, "rewards/accuracies": 0.84375, "rewards/chosen": -0.38065105676651, "rewards/margins": 0.6354185342788696, "rewards/rejected": -1.0160696506500244, "step": 508 }, { "epoch": 0.7474302496328928, "epsilon_dpo/beta": 0.0017901910468935966, "epsilon_dpo/beta_margin_grad_mean": -0.35225531458854675, "epsilon_dpo/beta_margin_grad_std": 0.10537883639335632, "epsilon_dpo/beta_margin_mean": 0.6424784660339355, "epsilon_dpo/beta_margin_std": 0.4860381782054901, "epsilon_dpo/loss_margin_mean": 359.7619323730469, "grad_norm": 38.303802490234375, "kl/avg_steps": 0.8125, "kl/beta": 0.0018046750919893384, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 9.227130018803195e-08, "logits/chosen": -2.86599063873291, "logits/rejected": -3.2632508277893066, "logps/chosen": -214.93182373046875, "logps/ref_chosen": -47.68657302856445, "logps/ref_rejected": -88.62358093261719, "logps/rejected": -615.6307373046875, "loss": 0.8974, "rewards/accuracies": 0.9375, "rewards/chosen": -0.30018889904022217, "rewards/margins": 0.6424784660339355, "rewards/rejected": -0.9426673650741577, "step": 509 }, { "epoch": 0.748898678414097, "epsilon_dpo/beta": 0.001774644129909575, "epsilon_dpo/beta_margin_grad_mean": -0.33876028656959534, "epsilon_dpo/beta_margin_grad_std": 0.0986710786819458, "epsilon_dpo/beta_margin_mean": 0.7086153626441956, "epsilon_dpo/beta_margin_std": 0.4846038222312927, "epsilon_dpo/loss_margin_mean": 399.8760070800781, "grad_norm": 32.25279235839844, "kl/avg_steps": 0.875, "kl/beta": 0.001790130278095603, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 9.127770814751932e-08, "logits/chosen": -2.808089256286621, "logits/rejected": -3.3828306198120117, "logps/chosen": -220.7513427734375, "logps/ref_chosen": -32.308326721191406, "logps/ref_rejected": -102.69437408447266, "logps/rejected": -691.013427734375, "loss": 0.8498, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33473625779151917, "rewards/margins": 0.7086154222488403, "rewards/rejected": -1.043351650238037, "step": 510 }, { "epoch": 0.750367107195301, "epsilon_dpo/beta": 0.0017620236612856388, "epsilon_dpo/beta_margin_grad_mean": -0.34761884808540344, "epsilon_dpo/beta_margin_grad_std": 0.10665444284677505, "epsilon_dpo/beta_margin_mean": 0.6692096590995789, "epsilon_dpo/beta_margin_std": 0.5043637752532959, "epsilon_dpo/loss_margin_mean": 380.9803466796875, "grad_norm": 34.81239318847656, "kl/avg_steps": 0.71875, "kl/beta": 0.001774602453224361, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.028829858700973e-08, "logits/chosen": -2.88916015625, "logits/rejected": -3.333850860595703, "logps/chosen": -226.4573974609375, "logps/ref_chosen": -41.02062225341797, "logps/ref_rejected": -98.91937255859375, "logps/rejected": -665.3365478515625, "loss": 0.8819, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3278487026691437, "rewards/margins": 0.6692096590995789, "rewards/rejected": -0.9970583915710449, "step": 511 }, { "epoch": 0.7518355359765051, "epsilon_dpo/beta": 0.001748348237015307, "epsilon_dpo/beta_margin_grad_mean": -0.3473552465438843, "epsilon_dpo/beta_margin_grad_std": 0.12345966696739197, "epsilon_dpo/beta_margin_mean": 0.6806043386459351, "epsilon_dpo/beta_margin_std": 0.5956137180328369, "epsilon_dpo/loss_margin_mean": 390.4645690917969, "grad_norm": 34.644046783447266, "kl/avg_steps": 0.78125, "kl/beta": 0.0017619385616853833, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 8.930309757836516e-08, "logits/chosen": -2.82757306098938, "logits/rejected": -3.3794026374816895, "logps/chosen": -213.54293823242188, "logps/ref_chosen": -37.26538848876953, "logps/ref_rejected": -88.17929077148438, "logps/rejected": -654.92138671875, "loss": 0.8953, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30911028385162354, "rewards/margins": 0.6806043386459351, "rewards/rejected": -0.9897146224975586, "step": 512 }, { "epoch": 0.7533039647577092, "epsilon_dpo/beta": 0.0017353416187688708, "epsilon_dpo/beta_margin_grad_mean": -0.3601101040840149, "epsilon_dpo/beta_margin_grad_std": 0.1102689802646637, "epsilon_dpo/beta_margin_mean": 0.6162811517715454, "epsilon_dpo/beta_margin_std": 0.5283299684524536, "epsilon_dpo/loss_margin_mean": 356.2281494140625, "grad_norm": 37.334041595458984, "kl/avg_steps": 0.75, "kl/beta": 0.0017482801340520382, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 8.832213108254863e-08, "logits/chosen": -2.8295106887817383, "logits/rejected": -3.2998032569885254, "logps/chosen": -189.08787536621094, "logps/ref_chosen": -35.62997055053711, "logps/ref_rejected": -81.91015625, "logps/rejected": -591.59619140625, "loss": 0.9234, "rewards/accuracies": 0.875, "rewards/chosen": -0.2673969268798828, "rewards/margins": 0.6162811517715454, "rewards/rejected": -0.8836780786514282, "step": 513 }, { "epoch": 0.7547723935389133, "epsilon_dpo/beta": 0.0017213387181982398, "epsilon_dpo/beta_margin_grad_mean": -0.35787054896354675, "epsilon_dpo/beta_margin_grad_std": 0.11788881570100784, "epsilon_dpo/beta_margin_mean": 0.6275299191474915, "epsilon_dpo/beta_margin_std": 0.5512734055519104, "epsilon_dpo/loss_margin_mean": 365.5804748535156, "grad_norm": 29.60846519470215, "kl/avg_steps": 0.8125, "kl/beta": 0.0017352655995637178, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 8.734542494893954e-08, "logits/chosen": -2.9027023315429688, "logits/rejected": -3.4183146953582764, "logps/chosen": -241.52822875976562, "logps/ref_chosen": -43.213531494140625, "logps/ref_rejected": -84.48947143554688, "logps/rejected": -648.3846435546875, "loss": 0.9222, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3422423005104065, "rewards/margins": 0.6275299787521362, "rewards/rejected": -0.969772219657898, "step": 514 }, { "epoch": 0.7562408223201175, "epsilon_dpo/beta": 0.0017085415311157703, "epsilon_dpo/beta_margin_grad_mean": -0.3623103201389313, "epsilon_dpo/beta_margin_grad_std": 0.13232813775539398, "epsilon_dpo/beta_margin_mean": 0.6002452969551086, "epsilon_dpo/beta_margin_std": 0.5947220921516418, "epsilon_dpo/loss_margin_mean": 352.9983825683594, "grad_norm": 36.903953552246094, "kl/avg_steps": 0.75, "kl/beta": 0.0017212802777066827, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 8.637300491465272e-08, "logits/chosen": -2.862236738204956, "logits/rejected": -3.4654173851013184, "logps/chosen": -235.07046508789062, "logps/ref_chosen": -33.94335174560547, "logps/ref_rejected": -92.82087707519531, "logps/rejected": -646.9464111328125, "loss": 0.9554, "rewards/accuracies": 0.890625, "rewards/chosen": -0.34564244747161865, "rewards/margins": 0.6002452969551086, "rewards/rejected": -0.9458876848220825, "step": 515 }, { "epoch": 0.7577092511013216, "epsilon_dpo/beta": 0.0016963566886261106, "epsilon_dpo/beta_margin_grad_mean": -0.3456820547580719, "epsilon_dpo/beta_margin_grad_std": 0.12934663891792297, "epsilon_dpo/beta_margin_mean": 0.6971822381019592, "epsilon_dpo/beta_margin_std": 0.6565114259719849, "epsilon_dpo/loss_margin_mean": 412.7080993652344, "grad_norm": 35.10322952270508, "kl/avg_steps": 0.71875, "kl/beta": 0.0017084666760638356, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.540489660386064e-08, "logits/chosen": -2.9307332038879395, "logits/rejected": -3.505221366882324, "logps/chosen": -244.57504272460938, "logps/ref_chosen": -48.20185852050781, "logps/ref_rejected": -119.76963806152344, "logps/rejected": -728.8509521484375, "loss": 0.8964, "rewards/accuracies": 0.875, "rewards/chosen": -0.33476826548576355, "rewards/margins": 0.6971822381019592, "rewards/rejected": -1.0319504737854004, "step": 516 }, { "epoch": 0.7591776798825257, "epsilon_dpo/beta": 0.0016842512413859367, "epsilon_dpo/beta_margin_grad_mean": -0.3475935757160187, "epsilon_dpo/beta_margin_grad_std": 0.12928926944732666, "epsilon_dpo/beta_margin_mean": 0.6708550453186035, "epsilon_dpo/beta_margin_std": 0.5967744588851929, "epsilon_dpo/loss_margin_mean": 400.1361083984375, "grad_norm": 38.30036163330078, "kl/avg_steps": 0.71875, "kl/beta": 0.0016962747322395444, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.444112552711752e-08, "logits/chosen": -2.9165990352630615, "logits/rejected": -3.483713150024414, "logps/chosen": -237.81472778320312, "logps/ref_chosen": -40.42920684814453, "logps/ref_rejected": -98.80826568603516, "logps/rejected": -696.3298950195312, "loss": 0.9048, "rewards/accuracies": 0.859375, "rewards/chosen": -0.33436378836631775, "rewards/margins": 0.6708550453186035, "rewards/rejected": -1.0052188634872437, "step": 517 }, { "epoch": 0.7606461086637298, "epsilon_dpo/beta": 0.0016727583715692163, "epsilon_dpo/beta_margin_grad_mean": -0.35805386304855347, "epsilon_dpo/beta_margin_grad_std": 0.1211152896285057, "epsilon_dpo/beta_margin_mean": 0.6288034915924072, "epsilon_dpo/beta_margin_std": 0.5658082962036133, "epsilon_dpo/loss_margin_mean": 377.4985046386719, "grad_norm": 43.182456970214844, "kl/avg_steps": 0.6875, "kl/beta": 0.0016841697506606579, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 8.348171708068747e-08, "logits/chosen": -2.878077983856201, "logits/rejected": -3.482807159423828, "logps/chosen": -228.24789428710938, "logps/ref_chosen": -37.2679557800293, "logps/ref_rejected": -90.86311340332031, "logps/rejected": -659.341552734375, "loss": 0.9249, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3209276795387268, "rewards/margins": 0.6288034915924072, "rewards/rejected": -0.9497312307357788, "step": 518 }, { "epoch": 0.762114537444934, "epsilon_dpo/beta": 0.0016592455795034766, "epsilon_dpo/beta_margin_grad_mean": -0.35770776867866516, "epsilon_dpo/beta_margin_grad_std": 0.13007250428199768, "epsilon_dpo/beta_margin_mean": 0.644246518611908, "epsilon_dpo/beta_margin_std": 0.639655590057373, "epsilon_dpo/loss_margin_mean": 389.47509765625, "grad_norm": 37.42156982421875, "kl/avg_steps": 0.8125, "kl/beta": 0.0016726701287552714, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 8.25266965458755e-08, "logits/chosen": -2.999945878982544, "logits/rejected": -3.565791606903076, "logps/chosen": -240.69334411621094, "logps/ref_chosen": -47.603675842285156, "logps/ref_rejected": -110.82022094726562, "logps/rejected": -693.385009765625, "loss": 0.9302, "rewards/accuracies": 0.875, "rewards/chosen": -0.32136669754981995, "rewards/margins": 0.6442465782165527, "rewards/rejected": -0.9656132459640503, "step": 519 }, { "epoch": 0.7635829662261381, "epsilon_dpo/beta": 0.0016458729514852166, "epsilon_dpo/beta_margin_grad_mean": -0.3453068435192108, "epsilon_dpo/beta_margin_grad_std": 0.10654862225055695, "epsilon_dpo/beta_margin_mean": 0.685132622718811, "epsilon_dpo/beta_margin_std": 0.5305820107460022, "epsilon_dpo/loss_margin_mean": 417.23046875, "grad_norm": 30.94179916381836, "kl/avg_steps": 0.8125, "kl/beta": 0.0016591892344877124, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 8.15760890883607e-08, "logits/chosen": -2.9207892417907715, "logits/rejected": -3.5509166717529297, "logps/chosen": -229.61695861816406, "logps/ref_chosen": -43.462921142578125, "logps/ref_rejected": -109.22837829589844, "logps/rejected": -712.6129150390625, "loss": 0.8746, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3073316216468811, "rewards/margins": 0.685132622718811, "rewards/rejected": -0.9924643039703369, "step": 520 }, { "epoch": 0.7650513950073421, "epsilon_dpo/beta": 0.001633122330531478, "epsilon_dpo/beta_margin_grad_mean": -0.3637232482433319, "epsilon_dpo/beta_margin_grad_std": 0.10722323507070541, "epsilon_dpo/beta_margin_mean": 0.5909576416015625, "epsilon_dpo/beta_margin_std": 0.4859505593776703, "epsilon_dpo/loss_margin_mean": 362.88031005859375, "grad_norm": 37.41933059692383, "kl/avg_steps": 0.78125, "kl/beta": 0.0016458169557154179, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 8.062991975753378e-08, "logits/chosen": -2.949124336242676, "logits/rejected": -3.4791650772094727, "logps/chosen": -218.08642578125, "logps/ref_chosen": -39.79935836791992, "logps/ref_rejected": -89.2203140258789, "logps/rejected": -630.3876953125, "loss": 0.9344, "rewards/accuracies": 0.921875, "rewards/chosen": -0.2921355366706848, "rewards/margins": 0.5909576416015625, "rewards/rejected": -0.8830931782722473, "step": 521 }, { "epoch": 0.7665198237885462, "epsilon_dpo/beta": 0.0016209728782996535, "epsilon_dpo/beta_margin_grad_mean": -0.38271984457969666, "epsilon_dpo/beta_margin_grad_std": 0.12090186774730682, "epsilon_dpo/beta_margin_mean": 0.511188805103302, "epsilon_dpo/beta_margin_std": 0.5572635531425476, "epsilon_dpo/loss_margin_mean": 316.7029724121094, "grad_norm": 34.424713134765625, "kl/avg_steps": 0.75, "kl/beta": 0.0016330587677657604, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 7.968821348583643e-08, "logits/chosen": -2.922499895095825, "logits/rejected": -3.3887128829956055, "logps/chosen": -223.6859893798828, "logps/ref_chosen": -32.023380279541016, "logps/ref_rejected": -72.67029571533203, "logps/rejected": -581.035888671875, "loss": 1.0096, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3123043179512024, "rewards/margins": 0.511188805103302, "rewards/rejected": -0.8234931230545044, "step": 522 }, { "epoch": 0.7679882525697503, "epsilon_dpo/beta": 0.0016109324060380459, "epsilon_dpo/beta_margin_grad_mean": -0.36688175797462463, "epsilon_dpo/beta_margin_grad_std": 0.12298644334077835, "epsilon_dpo/beta_margin_mean": 0.5938741564750671, "epsilon_dpo/beta_margin_std": 0.5870850682258606, "epsilon_dpo/loss_margin_mean": 370.4969482421875, "grad_norm": 34.04196548461914, "kl/avg_steps": 0.625, "kl/beta": 0.001620901981368661, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.875099508810484e-08, "logits/chosen": -2.9228086471557617, "logits/rejected": -3.478931427001953, "logps/chosen": -218.07928466796875, "logps/ref_chosen": -44.56959533691406, "logps/ref_rejected": -93.40034484863281, "logps/rejected": -637.406982421875, "loss": 0.9538, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2807372212409973, "rewards/margins": 0.5938742160797119, "rewards/rejected": -0.8746113777160645, "step": 523 }, { "epoch": 0.7694566813509545, "epsilon_dpo/beta": 0.001596899121068418, "epsilon_dpo/beta_margin_grad_mean": -0.3489263653755188, "epsilon_dpo/beta_margin_grad_std": 0.09371962398290634, "epsilon_dpo/beta_margin_mean": 0.6547738313674927, "epsilon_dpo/beta_margin_std": 0.4421177804470062, "epsilon_dpo/loss_margin_mean": 410.6026916503906, "grad_norm": 34.071510314941406, "kl/avg_steps": 0.875, "kl/beta": 0.0016108342679217458, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 7.781828926091535e-08, "logits/chosen": -3.0543670654296875, "logits/rejected": -3.656444549560547, "logps/chosen": -233.17068481445312, "logps/ref_chosen": -47.35930252075195, "logps/ref_rejected": -87.55693054199219, "logps/rejected": -683.9710083007812, "loss": 0.8791, "rewards/accuracies": 0.9375, "rewards/chosen": -0.296951562166214, "rewards/margins": 0.6547738313674927, "rewards/rejected": -0.9517253637313843, "step": 524 }, { "epoch": 0.7709251101321586, "epsilon_dpo/beta": 0.001583546632900834, "epsilon_dpo/beta_margin_grad_mean": -0.33597642183303833, "epsilon_dpo/beta_margin_grad_std": 0.11459914594888687, "epsilon_dpo/beta_margin_mean": 0.7328599691390991, "epsilon_dpo/beta_margin_std": 0.5668560862541199, "epsilon_dpo/loss_margin_mean": 463.7811584472656, "grad_norm": 33.28862380981445, "kl/avg_steps": 0.84375, "kl/beta": 0.0015968617517501116, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 7.689012058193384e-08, "logits/chosen": -2.9925098419189453, "logits/rejected": -3.6460928916931152, "logps/chosen": -205.41815185546875, "logps/ref_chosen": -29.801021575927734, "logps/ref_rejected": -107.73757934570312, "logps/rejected": -747.1358642578125, "loss": 0.8517, "rewards/accuracies": 0.9375, "rewards/chosen": -0.27915799617767334, "rewards/margins": 0.7328599691390991, "rewards/rejected": -1.0120179653167725, "step": 525 }, { "epoch": 0.7723935389133627, "epsilon_dpo/beta": 0.0015698021743446589, "epsilon_dpo/beta_margin_grad_mean": -0.3435947597026825, "epsilon_dpo/beta_margin_grad_std": 0.0916726216673851, "epsilon_dpo/beta_margin_mean": 0.6770256757736206, "epsilon_dpo/beta_margin_std": 0.4311753809452057, "epsilon_dpo/loss_margin_mean": 431.86358642578125, "grad_norm": 33.44818115234375, "kl/avg_steps": 0.875, "kl/beta": 0.001583500881679356, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 7.596651350926836e-08, "logits/chosen": -2.96604061126709, "logits/rejected": -3.718456983566284, "logps/chosen": -219.9847412109375, "logps/ref_chosen": -37.456085205078125, "logps/ref_rejected": -93.045654296875, "logps/rejected": -707.4378662109375, "loss": 0.8619, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2869272232055664, "rewards/margins": 0.6770256757736206, "rewards/rejected": -0.963952898979187, "step": 526 }, { "epoch": 0.7738619676945668, "epsilon_dpo/beta": 0.0015561856562271714, "epsilon_dpo/beta_margin_grad_mean": -0.35842540860176086, "epsilon_dpo/beta_margin_grad_std": 0.09837586432695389, "epsilon_dpo/beta_margin_mean": 0.6056324243545532, "epsilon_dpo/beta_margin_std": 0.44141340255737305, "epsilon_dpo/loss_margin_mean": 389.9037170410156, "grad_norm": 33.482730865478516, "kl/avg_steps": 0.875, "kl/beta": 0.0015697655035182834, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 7.504749238082414e-08, "logits/chosen": -3.048184871673584, "logits/rejected": -3.7029123306274414, "logps/chosen": -219.21990966796875, "logps/ref_chosen": -43.55718231201172, "logps/ref_rejected": -83.77217102050781, "logps/rejected": -649.338623046875, "loss": 0.9152, "rewards/accuracies": 0.953125, "rewards/chosen": -0.27390962839126587, "rewards/margins": 0.6056324243545532, "rewards/rejected": -0.8795421123504639, "step": 527 }, { "epoch": 0.775330396475771, "epsilon_dpo/beta": 0.0015451188664883375, "epsilon_dpo/beta_margin_grad_mean": -0.35285684466362, "epsilon_dpo/beta_margin_grad_std": 0.11853712797164917, "epsilon_dpo/beta_margin_mean": 0.6550965309143066, "epsilon_dpo/beta_margin_std": 0.5716320276260376, "epsilon_dpo/loss_margin_mean": 425.4979248046875, "grad_norm": 36.37355041503906, "kl/avg_steps": 0.71875, "kl/beta": 0.0015561492182314396, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.413308141366254e-08, "logits/chosen": -3.052718162536621, "logits/rejected": -3.7526984214782715, "logps/chosen": -250.51231384277344, "logps/ref_chosen": -46.319541931152344, "logps/ref_rejected": -101.05429077148438, "logps/rejected": -730.7449951171875, "loss": 0.9063, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3164703845977783, "rewards/margins": 0.6550965309143066, "rewards/rejected": -0.971566915512085, "step": 528 }, { "epoch": 0.7767988252569751, "epsilon_dpo/beta": 0.0015350582543760538, "epsilon_dpo/beta_margin_grad_mean": -0.3771320581436157, "epsilon_dpo/beta_margin_grad_std": 0.11737023293972015, "epsilon_dpo/beta_margin_mean": 0.531406581401825, "epsilon_dpo/beta_margin_std": 0.5209585428237915, "epsilon_dpo/loss_margin_mean": 347.9163513183594, "grad_norm": 36.469417572021484, "kl/avg_steps": 0.65625, "kl/beta": 0.0015450441278517246, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.322330470336313e-08, "logits/chosen": -3.203193187713623, "logits/rejected": -3.8966808319091797, "logps/chosen": -311.4798889160156, "logps/ref_chosen": -39.26038360595703, "logps/ref_rejected": -94.54532623291016, "logps/rejected": -714.68115234375, "loss": 0.987, "rewards/accuracies": 0.84375, "rewards/chosen": -0.41968581080436707, "rewards/margins": 0.531406581401825, "rewards/rejected": -0.9510923624038696, "step": 529 }, { "epoch": 0.7782672540381792, "epsilon_dpo/beta": 0.0015236110193654895, "epsilon_dpo/beta_margin_grad_mean": -0.3523128926753998, "epsilon_dpo/beta_margin_grad_std": 0.11749440431594849, "epsilon_dpo/beta_margin_mean": 0.665305495262146, "epsilon_dpo/beta_margin_std": 0.6024741530418396, "epsilon_dpo/loss_margin_mean": 438.0719299316406, "grad_norm": 38.925601959228516, "kl/avg_steps": 0.75, "kl/beta": 0.0015349709428846836, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 7.231818622338822e-08, "logits/chosen": -3.1173648834228516, "logits/rejected": -3.8287353515625, "logps/chosen": -200.4312744140625, "logps/ref_chosen": -27.232646942138672, "logps/ref_rejected": -92.85646057128906, "logps/rejected": -704.1270751953125, "loss": 0.9029, "rewards/accuracies": 0.875, "rewards/chosen": -0.264974981546402, "rewards/margins": 0.665305495262146, "rewards/rejected": -0.9302804470062256, "step": 530 }, { "epoch": 0.7797356828193832, "epsilon_dpo/beta": 0.0015113166300579906, "epsilon_dpo/beta_margin_grad_mean": -0.36579129099845886, "epsilon_dpo/beta_margin_grad_std": 0.10368164628744125, "epsilon_dpo/beta_margin_mean": 0.5806282758712769, "epsilon_dpo/beta_margin_std": 0.4778251051902771, "epsilon_dpo/loss_margin_mean": 385.13055419921875, "grad_norm": 39.48409652709961, "kl/avg_steps": 0.8125, "kl/beta": 0.0015235443133860826, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 7.141774982445147e-08, "logits/chosen": -3.1506361961364746, "logits/rejected": -3.9508585929870605, "logps/chosen": -226.21939086914062, "logps/ref_chosen": -29.433807373046875, "logps/ref_rejected": -77.6962890625, "logps/rejected": -659.6124267578125, "loss": 0.9395, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2982354760169983, "rewards/margins": 0.5806282758712769, "rewards/rejected": -0.8788637518882751, "step": 531 }, { "epoch": 0.7812041116005873, "epsilon_dpo/beta": 0.0015024424064904451, "epsilon_dpo/beta_margin_grad_mean": -0.36811915040016174, "epsilon_dpo/beta_margin_grad_std": 0.13502071797847748, "epsilon_dpo/beta_margin_mean": 0.5831210017204285, "epsilon_dpo/beta_margin_std": 0.6058715581893921, "epsilon_dpo/loss_margin_mean": 390.57232666015625, "grad_norm": 41.38243103027344, "kl/avg_steps": 0.59375, "kl/beta": 0.0015112652909010649, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 7.052201923388953e-08, "logits/chosen": -3.245467185974121, "logits/rejected": -3.924285888671875, "logps/chosen": -292.22418212890625, "logps/ref_chosen": -49.116477966308594, "logps/ref_rejected": -93.35752868652344, "logps/rejected": -727.0375366210938, "loss": 0.9702, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3671477138996124, "rewards/margins": 0.5831209421157837, "rewards/rejected": -0.9502686858177185, "step": 532 }, { "epoch": 0.7826725403817915, "epsilon_dpo/beta": 0.0014921656111255288, "epsilon_dpo/beta_margin_grad_mean": -0.3761315643787384, "epsilon_dpo/beta_margin_grad_std": 0.12172158807516098, "epsilon_dpo/beta_margin_mean": 0.5404428243637085, "epsilon_dpo/beta_margin_std": 0.5591951012611389, "epsilon_dpo/loss_margin_mean": 364.0026550292969, "grad_norm": 38.90617752075195, "kl/avg_steps": 0.6875, "kl/beta": 0.0015023450832813978, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.963101805503646e-08, "logits/chosen": -3.1959118843078613, "logits/rejected": -3.9355404376983643, "logps/chosen": -277.5054016113281, "logps/ref_chosen": -39.03050994873047, "logps/ref_rejected": -81.61624145507812, "logps/rejected": -684.0938110351562, "loss": 0.9883, "rewards/accuracies": 0.84375, "rewards/chosen": -0.35740402340888977, "rewards/margins": 0.5404428243637085, "rewards/rejected": -0.8978468179702759, "step": 533 }, { "epoch": 0.7841409691629956, "epsilon_dpo/beta": 0.0014812753070145845, "epsilon_dpo/beta_margin_grad_mean": -0.35220903158187866, "epsilon_dpo/beta_margin_grad_std": 0.10367898643016815, "epsilon_dpo/beta_margin_mean": 0.6407039165496826, "epsilon_dpo/beta_margin_std": 0.46845534443855286, "epsilon_dpo/loss_margin_mean": 433.8296813964844, "grad_norm": 43.0300178527832, "kl/avg_steps": 0.734375, "kl/beta": 0.0014920870307832956, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 6.874476976660184e-08, "logits/chosen": -3.1139631271362305, "logits/rejected": -3.9010872840881348, "logps/chosen": -213.8875732421875, "logps/ref_chosen": -33.198890686035156, "logps/ref_rejected": -85.75593566894531, "logps/rejected": -700.2742919921875, "loss": 0.8956, "rewards/accuracies": 0.90625, "rewards/chosen": -0.26844221353530884, "rewards/margins": 0.6407039165496826, "rewards/rejected": -0.9091461896896362, "step": 534 }, { "epoch": 0.7856093979441997, "epsilon_dpo/beta": 0.0014679327141493559, "epsilon_dpo/beta_margin_grad_mean": -0.34026262164115906, "epsilon_dpo/beta_margin_grad_std": 0.11611247062683105, "epsilon_dpo/beta_margin_mean": 0.7134142518043518, "epsilon_dpo/beta_margin_std": 0.5855581164360046, "epsilon_dpo/loss_margin_mean": 486.8583068847656, "grad_norm": 40.018455505371094, "kl/avg_steps": 0.90625, "kl/beta": 0.0014812094159424305, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 6.786329772205246e-08, "logits/chosen": -3.2459397315979004, "logits/rejected": -4.044425010681152, "logps/chosen": -248.54649353027344, "logps/ref_chosen": -39.410072326660156, "logps/ref_rejected": -95.90589904785156, "logps/rejected": -791.900634765625, "loss": 0.8681, "rewards/accuracies": 0.953125, "rewards/chosen": -0.3082220256328583, "rewards/margins": 0.7134143114089966, "rewards/rejected": -1.0216362476348877, "step": 535 }, { "epoch": 0.7870778267254038, "epsilon_dpo/beta": 0.001455666613765061, "epsilon_dpo/beta_margin_grad_mean": -0.34118029475212097, "epsilon_dpo/beta_margin_grad_std": 0.1087789535522461, "epsilon_dpo/beta_margin_mean": 0.7139739990234375, "epsilon_dpo/beta_margin_std": 0.577499270439148, "epsilon_dpo/loss_margin_mean": 491.39007568359375, "grad_norm": 43.79270553588867, "kl/avg_steps": 0.84375, "kl/beta": 0.0014679065207019448, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 6.698662514899638e-08, "logits/chosen": -3.22434139251709, "logits/rejected": -4.060914039611816, "logps/chosen": -262.8885498046875, "logps/ref_chosen": -31.54595184326172, "logps/ref_rejected": -95.49956512451172, "logps/rejected": -818.2322387695312, "loss": 0.8622, "rewards/accuracies": 0.921875, "rewards/chosen": -0.33760565519332886, "rewards/margins": 0.7139739990234375, "rewards/rejected": -1.0515797138214111, "step": 536 }, { "epoch": 0.788546255506608, "epsilon_dpo/beta": 0.0014448519796133041, "epsilon_dpo/beta_margin_grad_mean": -0.3592894971370697, "epsilon_dpo/beta_margin_grad_std": 0.10450321435928345, "epsilon_dpo/beta_margin_mean": 0.6133064031600952, "epsilon_dpo/beta_margin_std": 0.48698103427886963, "epsilon_dpo/loss_margin_mean": 425.6983947753906, "grad_norm": 37.07543182373047, "kl/avg_steps": 0.75, "kl/beta": 0.0014556247042492032, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 6.611477514857114e-08, "logits/chosen": -3.1620330810546875, "logits/rejected": -3.9779064655303955, "logps/chosen": -205.30322265625, "logps/ref_chosen": -29.482723236083984, "logps/ref_rejected": -78.68902587890625, "logps/rejected": -680.2078857421875, "loss": 0.9176, "rewards/accuracies": 0.921875, "rewards/chosen": -0.2549804449081421, "rewards/margins": 0.6133064031600952, "rewards/rejected": -0.8682868480682373, "step": 537 }, { "epoch": 0.7900146842878121, "epsilon_dpo/beta": 0.0014349992852658033, "epsilon_dpo/beta_margin_grad_mean": -0.34261569380760193, "epsilon_dpo/beta_margin_grad_std": 0.132565438747406, "epsilon_dpo/beta_margin_mean": 0.7123464941978455, "epsilon_dpo/beta_margin_std": 0.6496385931968689, "epsilon_dpo/loss_margin_mean": 498.54559326171875, "grad_norm": 43.92093276977539, "kl/avg_steps": 0.6875, "kl/beta": 0.0014447887660935521, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.524777069483525e-08, "logits/chosen": -3.295565128326416, "logits/rejected": -4.157257080078125, "logps/chosen": -342.5325622558594, "logps/ref_chosen": -46.485496520996094, "logps/ref_rejected": -91.08615112304688, "logps/rejected": -885.6788330078125, "loss": 0.8867, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4264609217643738, "rewards/margins": 0.7123464345932007, "rewards/rejected": -1.1388074159622192, "step": 538 }, { "epoch": 0.7914831130690162, "epsilon_dpo/beta": 0.001425649388693273, "epsilon_dpo/beta_margin_grad_mean": -0.34682291746139526, "epsilon_dpo/beta_margin_grad_std": 0.12398696690797806, "epsilon_dpo/beta_margin_mean": 0.6886243224143982, "epsilon_dpo/beta_margin_std": 0.60282963514328, "epsilon_dpo/loss_margin_mean": 485.098388671875, "grad_norm": 45.51395034790039, "kl/avg_steps": 0.65625, "kl/beta": 0.0014349236153066158, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.438563463416221e-08, "logits/chosen": -3.2940754890441895, "logits/rejected": -4.077099800109863, "logps/chosen": -323.3985595703125, "logps/ref_chosen": -39.210296630859375, "logps/ref_rejected": -95.66792297363281, "logps/rejected": -864.95458984375, "loss": 0.8907, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4068090319633484, "rewards/margins": 0.688624382019043, "rewards/rejected": -1.0954333543777466, "step": 539 }, { "epoch": 0.7929515418502202, "epsilon_dpo/beta": 0.0014163546729832888, "epsilon_dpo/beta_margin_grad_mean": -0.32467931509017944, "epsilon_dpo/beta_margin_grad_std": 0.1530696600675583, "epsilon_dpo/beta_margin_mean": 0.8351730108261108, "epsilon_dpo/beta_margin_std": 0.7946527004241943, "epsilon_dpo/loss_margin_mean": 592.330078125, "grad_norm": 49.0323486328125, "kl/avg_steps": 0.65625, "kl/beta": 0.0014255683636292815, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.352838968463919e-08, "logits/chosen": -3.369414806365967, "logits/rejected": -4.240825176239014, "logps/chosen": -385.478759765625, "logps/ref_chosen": -50.91047668457031, "logps/ref_rejected": -125.03810119628906, "logps/rejected": -1051.9365234375, "loss": 0.8461, "rewards/accuracies": 0.859375, "rewards/chosen": -0.47576338052749634, "rewards/margins": 0.8351730108261108, "rewards/rejected": -1.310936450958252, "step": 540 }, { "epoch": 0.7944199706314243, "epsilon_dpo/beta": 0.0014062352711334825, "epsilon_dpo/beta_margin_grad_mean": -0.3354036808013916, "epsilon_dpo/beta_margin_grad_std": 0.13668367266654968, "epsilon_dpo/beta_margin_mean": 0.7499971985816956, "epsilon_dpo/beta_margin_std": 0.6641453504562378, "epsilon_dpo/loss_margin_mean": 535.408447265625, "grad_norm": 54.5290412902832, "kl/avg_steps": 0.71875, "kl/beta": 0.0014162741135805845, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 6.267605843546767e-08, "logits/chosen": -3.3453049659729004, "logits/rejected": -4.137938499450684, "logps/chosen": -380.36578369140625, "logps/ref_chosen": -48.1887321472168, "logps/ref_rejected": -110.07643127441406, "logps/rejected": -977.6619873046875, "loss": 0.8668, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4690607190132141, "rewards/margins": 0.7499971985816956, "rewards/rejected": -1.2190579175949097, "step": 541 }, { "epoch": 0.7958883994126285, "epsilon_dpo/beta": 0.0013966395054012537, "epsilon_dpo/beta_margin_grad_mean": -0.31941473484039307, "epsilon_dpo/beta_margin_grad_std": 0.1469242125749588, "epsilon_dpo/beta_margin_mean": 0.8643131852149963, "epsilon_dpo/beta_margin_std": 0.7869569063186646, "epsilon_dpo/loss_margin_mean": 621.3475952148438, "grad_norm": 42.56067657470703, "kl/avg_steps": 0.6875, "kl/beta": 0.0014061672845855355, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.182866334636888e-08, "logits/chosen": -3.2921829223632812, "logits/rejected": -4.272438049316406, "logps/chosen": -294.858642578125, "logps/ref_chosen": -34.308189392089844, "logps/ref_rejected": -104.48388671875, "logps/rejected": -986.3819580078125, "loss": 0.8216, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3652591109275818, "rewards/margins": 0.8643131852149963, "rewards/rejected": -1.2295722961425781, "step": 542 }, { "epoch": 0.7973568281938326, "epsilon_dpo/beta": 0.0013875396689400077, "epsilon_dpo/beta_margin_grad_mean": -0.32512134313583374, "epsilon_dpo/beta_margin_grad_std": 0.160647913813591, "epsilon_dpo/beta_margin_mean": 0.8715978860855103, "epsilon_dpo/beta_margin_std": 0.9173281192779541, "epsilon_dpo/loss_margin_mean": 631.060791015625, "grad_norm": 50.612884521484375, "kl/avg_steps": 0.65625, "kl/beta": 0.0013965658145025373, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.098622674699147e-08, "logits/chosen": -3.523017168045044, "logits/rejected": -4.250926971435547, "logps/chosen": -422.712646484375, "logps/ref_chosen": -43.612937927246094, "logps/ref_rejected": -111.56673431396484, "logps/rejected": -1121.7271728515625, "loss": 0.8507, "rewards/accuracies": 0.859375, "rewards/chosen": -0.527320921421051, "rewards/margins": 0.8715978860855103, "rewards/rejected": -1.398918867111206, "step": 543 }, { "epoch": 0.7988252569750367, "epsilon_dpo/beta": 0.001378059620037675, "epsilon_dpo/beta_margin_grad_mean": -0.3185425102710724, "epsilon_dpo/beta_margin_grad_std": 0.15885071456432343, "epsilon_dpo/beta_margin_mean": 0.9396871328353882, "epsilon_dpo/beta_margin_std": 1.0003536939620972, "epsilon_dpo/loss_margin_mean": 684.6524047851562, "grad_norm": 46.56395721435547, "kl/avg_steps": 0.6875, "kl/beta": 0.0013874606229364872, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.01487708363232e-08, "logits/chosen": -3.383917808532715, "logits/rejected": -4.392078876495361, "logps/chosen": -419.0536193847656, "logps/ref_chosen": -39.47812271118164, "logps/ref_rejected": -106.17308044433594, "logps/rejected": -1170.40087890625, "loss": 0.8257, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5249900221824646, "rewards/margins": 0.9396871328353882, "rewards/rejected": -1.464677095413208, "step": 544 }, { "epoch": 0.8002936857562408, "epsilon_dpo/beta": 0.001366927521303296, "epsilon_dpo/beta_margin_grad_mean": -0.3136303424835205, "epsilon_dpo/beta_margin_grad_std": 0.1529344618320465, "epsilon_dpo/beta_margin_mean": 0.9094871282577515, "epsilon_dpo/beta_margin_std": 0.8563925623893738, "epsilon_dpo/loss_margin_mean": 667.2245483398438, "grad_norm": 44.38985061645508, "kl/avg_steps": 0.8125, "kl/beta": 0.001377986976876855, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -3.4539644718170166, "logits/rejected": -4.4015960693359375, "logps/chosen": -428.8343505859375, "logps/ref_chosen": -42.28656768798828, "logps/ref_rejected": -100.58802032470703, "logps/rejected": -1154.3603515625, "loss": 0.8128, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5297784805297852, "rewards/margins": 0.9094871282577515, "rewards/rejected": -1.4392656087875366, "step": 545 }, { "epoch": 0.801762114537445, "epsilon_dpo/beta": 0.0013584739062935114, "epsilon_dpo/beta_margin_grad_mean": -0.33587032556533813, "epsilon_dpo/beta_margin_grad_std": 0.16217297315597534, "epsilon_dpo/beta_margin_mean": 0.7780935764312744, "epsilon_dpo/beta_margin_std": 0.8156005144119263, "epsilon_dpo/loss_margin_mean": 576.030517578125, "grad_norm": 47.50184631347656, "kl/avg_steps": 0.625, "kl/beta": 0.0013668810715898871, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.848888922025552e-08, "logits/chosen": -3.3195810317993164, "logits/rejected": -4.246737957000732, "logps/chosen": -418.70166015625, "logps/ref_chosen": -35.013492584228516, "logps/ref_rejected": -86.37733459472656, "logps/rejected": -1046.0960693359375, "loss": 0.8927, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5232120156288147, "rewards/margins": 0.7780935764312744, "rewards/rejected": -1.3013055324554443, "step": 546 }, { "epoch": 0.8032305433186491, "epsilon_dpo/beta": 0.0013491871068254113, "epsilon_dpo/beta_margin_grad_mean": -0.3124694228172302, "epsilon_dpo/beta_margin_grad_std": 0.15798072516918182, "epsilon_dpo/beta_margin_mean": 0.9112512469291687, "epsilon_dpo/beta_margin_std": 0.8325828909873962, "epsilon_dpo/loss_margin_mean": 678.1919555664062, "grad_norm": 46.05626678466797, "kl/avg_steps": 0.6875, "kl/beta": 0.0013583911349996924, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -3.386657238006592, "logits/rejected": -4.366371154785156, "logps/chosen": -437.0877380371094, "logps/ref_chosen": -45.810516357421875, "logps/ref_rejected": -95.15408325195312, "logps/rejected": -1164.623291015625, "loss": 0.8106, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5293705463409424, "rewards/margins": 0.9112513065338135, "rewards/rejected": -1.4406218528747559, "step": 547 }, { "epoch": 0.8046989720998532, "epsilon_dpo/beta": 0.0013399748131632805, "epsilon_dpo/beta_margin_grad_mean": -0.3127719461917877, "epsilon_dpo/beta_margin_grad_std": 0.14488740265369415, "epsilon_dpo/beta_margin_mean": 0.946838915348053, "epsilon_dpo/beta_margin_std": 0.9356114864349365, "epsilon_dpo/loss_margin_mean": 709.183349609375, "grad_norm": 44.01782989501953, "kl/avg_steps": 0.6875, "kl/beta": 0.001349115977063775, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.684919345471029e-08, "logits/chosen": -3.3794431686401367, "logits/rejected": -4.389235496520996, "logps/chosen": -411.8974914550781, "logps/ref_chosen": -40.29045486450195, "logps/ref_rejected": -98.83418273925781, "logps/rejected": -1179.62451171875, "loss": 0.7973, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4989817142486572, "rewards/margins": 0.9468388557434082, "rewards/rejected": -1.4458205699920654, "step": 548 }, { "epoch": 0.8061674008810573, "epsilon_dpo/beta": 0.0013320817379280925, "epsilon_dpo/beta_margin_grad_mean": -0.35267123579978943, "epsilon_dpo/beta_margin_grad_std": 0.17075052857398987, "epsilon_dpo/beta_margin_mean": 0.7173733115196228, "epsilon_dpo/beta_margin_std": 0.8851867318153381, "epsilon_dpo/loss_margin_mean": 541.94580078125, "grad_norm": 81.65141296386719, "kl/avg_steps": 0.59375, "kl/beta": 0.0013399041490629315, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.603696935852426e-08, "logits/chosen": -3.2905972003936768, "logits/rejected": -4.312599182128906, "logps/chosen": -439.40911865234375, "logps/ref_chosen": -33.00281524658203, "logps/ref_rejected": -81.72914123535156, "logps/rejected": -1030.0811767578125, "loss": 0.9535, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5441075563430786, "rewards/margins": 0.717373251914978, "rewards/rejected": -1.2614808082580566, "step": 549 }, { "epoch": 0.8076358296622613, "epsilon_dpo/beta": 0.0013221375411376357, "epsilon_dpo/beta_margin_grad_mean": -0.30787116289138794, "epsilon_dpo/beta_margin_grad_std": 0.15403428673744202, "epsilon_dpo/beta_margin_mean": 0.9514063596725464, "epsilon_dpo/beta_margin_std": 0.8637392520904541, "epsilon_dpo/loss_margin_mean": 721.8765869140625, "grad_norm": 47.79182052612305, "kl/avg_steps": 0.75, "kl/beta": 0.001331995357759297, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -3.2790517807006836, "logits/rejected": -4.2515106201171875, "logps/chosen": -415.656982421875, "logps/ref_chosen": -33.51374053955078, "logps/ref_rejected": -101.08897399902344, "logps/rejected": -1205.10888671875, "loss": 0.79, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5064237713813782, "rewards/margins": 0.9514063596725464, "rewards/rejected": -1.4578301906585693, "step": 550 }, { "epoch": 0.8091042584434655, "epsilon_dpo/beta": 0.0013135350309312344, "epsilon_dpo/beta_margin_grad_mean": -0.28273874521255493, "epsilon_dpo/beta_margin_grad_std": 0.17567141354084015, "epsilon_dpo/beta_margin_mean": 1.0942435264587402, "epsilon_dpo/beta_margin_std": 0.9498324394226074, "epsilon_dpo/loss_margin_mean": 836.8770751953125, "grad_norm": 45.95981979370117, "kl/avg_steps": 0.65625, "kl/beta": 0.0013220797991380095, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -3.347069501876831, "logits/rejected": -4.404940605163574, "logps/chosen": -488.1016845703125, "logps/ref_chosen": -42.42958068847656, "logps/ref_rejected": -117.51499938964844, "logps/rejected": -1400.064208984375, "loss": 0.7468, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5882077217102051, "rewards/margins": 1.0942435264587402, "rewards/rejected": -1.6824512481689453, "step": 551 }, { "epoch": 0.8105726872246696, "epsilon_dpo/beta": 0.0013033291324973106, "epsilon_dpo/beta_margin_grad_mean": -0.27695924043655396, "epsilon_dpo/beta_margin_grad_std": 0.17922824621200562, "epsilon_dpo/beta_margin_mean": 1.1912535429000854, "epsilon_dpo/beta_margin_std": 1.1001869440078735, "epsilon_dpo/loss_margin_mean": 916.8844604492188, "grad_norm": 40.460792541503906, "kl/avg_steps": 0.78125, "kl/beta": 0.0013134601758792996, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 5.363104864490034e-08, "logits/chosen": -3.3478827476501465, "logits/rejected": -4.524975776672363, "logps/chosen": -496.78289794921875, "logps/ref_chosen": -40.86379623413086, "logps/ref_rejected": -112.38937377929688, "logps/rejected": -1485.19287109375, "loss": 0.7312, "rewards/accuracies": 0.921875, "rewards/chosen": -0.5959450006484985, "rewards/margins": 1.1912535429000854, "rewards/rejected": -1.787198543548584, "step": 552 }, { "epoch": 0.8120411160058737, "epsilon_dpo/beta": 0.001297299051657319, "epsilon_dpo/beta_margin_grad_mean": -0.3533886671066284, "epsilon_dpo/beta_margin_grad_std": 0.17754045128822327, "epsilon_dpo/beta_margin_mean": 0.7519553303718567, "epsilon_dpo/beta_margin_std": 1.001431941986084, "epsilon_dpo/loss_margin_mean": 583.842041015625, "grad_norm": 88.0752182006836, "kl/avg_steps": 0.46875, "kl/beta": 0.001303278375416994, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -3.496969699859619, "logits/rejected": -4.206221580505371, "logps/chosen": -561.1559448242188, "logps/ref_chosen": -51.35152816772461, "logps/ref_rejected": -103.84159851074219, "logps/rejected": -1197.488037109375, "loss": 0.9607, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6639860272407532, "rewards/margins": 0.7519553303718567, "rewards/rejected": -1.4159413576126099, "step": 553 }, { "epoch": 0.8135095447870778, "epsilon_dpo/beta": 0.001288408413529396, "epsilon_dpo/beta_margin_grad_mean": -0.31087225675582886, "epsilon_dpo/beta_margin_grad_std": 0.16114763915538788, "epsilon_dpo/beta_margin_mean": 0.9212867021560669, "epsilon_dpo/beta_margin_std": 0.8502605557441711, "epsilon_dpo/loss_margin_mean": 718.2112426757812, "grad_norm": 48.403167724609375, "kl/avg_steps": 0.6875, "kl/beta": 0.0012971977703273296, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.205293880283551e-08, "logits/chosen": -3.321976661682129, "logits/rejected": -4.445281982421875, "logps/chosen": -478.4771728515625, "logps/ref_chosen": -46.820640563964844, "logps/ref_rejected": -94.6780014038086, "logps/rejected": -1244.5457763671875, "loss": 0.8105, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5585451126098633, "rewards/margins": 0.9212867021560669, "rewards/rejected": -1.4798319339752197, "step": 554 }, { "epoch": 0.8149779735682819, "epsilon_dpo/beta": 0.001279208343476057, "epsilon_dpo/beta_margin_grad_mean": -0.27689388394355774, "epsilon_dpo/beta_margin_grad_std": 0.16369499266147614, "epsilon_dpo/beta_margin_mean": 1.1780664920806885, "epsilon_dpo/beta_margin_std": 1.0308868885040283, "epsilon_dpo/loss_margin_mean": 923.9918212890625, "grad_norm": 45.80141830444336, "kl/avg_steps": 0.71875, "kl/beta": 0.0012883404269814491, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.127169765359515e-08, "logits/chosen": -3.4040637016296387, "logits/rejected": -4.443061828613281, "logps/chosen": -423.08660888671875, "logps/ref_chosen": -33.03218460083008, "logps/ref_rejected": -115.6031265258789, "logps/rejected": -1429.6494140625, "loss": 0.7069, "rewards/accuracies": 0.890625, "rewards/chosen": -0.49974268674850464, "rewards/margins": 1.1780664920806885, "rewards/rejected": -1.677809238433838, "step": 555 }, { "epoch": 0.8164464023494861, "epsilon_dpo/beta": 0.001270079636014998, "epsilon_dpo/beta_margin_grad_mean": -0.3221823573112488, "epsilon_dpo/beta_margin_grad_std": 0.15197478234767914, "epsilon_dpo/beta_margin_mean": 0.8850791454315186, "epsilon_dpo/beta_margin_std": 0.8993016481399536, "epsilon_dpo/loss_margin_mean": 699.4880981445312, "grad_norm": 50.429317474365234, "kl/avg_steps": 0.71875, "kl/beta": 0.001279146526940167, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.049569317994012e-08, "logits/chosen": -3.259387493133545, "logits/rejected": -4.264613151550293, "logps/chosen": -447.8577575683594, "logps/ref_chosen": -32.514793395996094, "logps/ref_rejected": -106.99200439453125, "logps/rejected": -1221.822998046875, "loss": 0.8334, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5285704135894775, "rewards/margins": 0.8850791454315186, "rewards/rejected": -1.413649559020996, "step": 556 }, { "epoch": 0.8179148311306902, "epsilon_dpo/beta": 0.00126141298096627, "epsilon_dpo/beta_margin_grad_mean": -0.3104838728904724, "epsilon_dpo/beta_margin_grad_std": 0.16874933242797852, "epsilon_dpo/beta_margin_mean": 0.9584492444992065, "epsilon_dpo/beta_margin_std": 0.9464617371559143, "epsilon_dpo/loss_margin_mean": 763.000244140625, "grad_norm": 49.268924713134766, "kl/avg_steps": 0.6875, "kl/beta": 0.0012700182851403952, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -3.3845365047454834, "logits/rejected": -4.363818645477295, "logps/chosen": -543.5952758789062, "logps/ref_chosen": -51.166099548339844, "logps/ref_rejected": -117.06163024902344, "logps/rejected": -1372.490966796875, "loss": 0.8139, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6233251094818115, "rewards/margins": 0.9584492444992065, "rewards/rejected": -1.581774353981018, "step": 557 }, { "epoch": 0.8193832599118943, "epsilon_dpo/beta": 0.0012516174465417862, "epsilon_dpo/beta_margin_grad_mean": -0.27005571126937866, "epsilon_dpo/beta_margin_grad_std": 0.14844605326652527, "epsilon_dpo/beta_margin_mean": 1.16920006275177, "epsilon_dpo/beta_margin_std": 0.9076313972473145, "epsilon_dpo/loss_margin_mean": 936.5662841796875, "grad_norm": 45.054534912109375, "kl/avg_steps": 0.78125, "kl/beta": 0.0012613465078175068, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -3.1819422245025635, "logits/rejected": -4.413593292236328, "logps/chosen": -387.129150390625, "logps/ref_chosen": -38.345916748046875, "logps/ref_rejected": -109.03158569335938, "logps/rejected": -1394.381103515625, "loss": 0.6774, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4374838173389435, "rewards/margins": 1.16920006275177, "rewards/rejected": -1.6066837310791016, "step": 558 }, { "epoch": 0.8208516886930984, "epsilon_dpo/beta": 0.0012430885108187795, "epsilon_dpo/beta_margin_grad_mean": -0.31191176176071167, "epsilon_dpo/beta_margin_grad_std": 0.17650018632411957, "epsilon_dpo/beta_margin_mean": 0.9644886255264282, "epsilon_dpo/beta_margin_std": 1.0081322193145752, "epsilon_dpo/loss_margin_mean": 779.3812255859375, "grad_norm": 63.11558151245117, "kl/avg_steps": 0.6875, "kl/beta": 0.0012515686685219407, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -3.2931509017944336, "logits/rejected": -4.473822593688965, "logps/chosen": -542.97265625, "logps/ref_chosen": -42.82691955566406, "logps/ref_rejected": -98.61148071289062, "logps/rejected": -1378.138427734375, "loss": 0.8302, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6229937672615051, "rewards/margins": 0.9644886255264282, "rewards/rejected": -1.5874824523925781, "step": 559 }, { "epoch": 0.8223201174743024, "epsilon_dpo/beta": 0.0012365429429337382, "epsilon_dpo/beta_margin_grad_mean": -0.34858644008636475, "epsilon_dpo/beta_margin_grad_std": 0.15379458665847778, "epsilon_dpo/beta_margin_mean": 0.7325094938278198, "epsilon_dpo/beta_margin_std": 0.8337624669075012, "epsilon_dpo/loss_margin_mean": 595.9330444335938, "grad_norm": 46.32260513305664, "kl/avg_steps": 0.53125, "kl/beta": 0.001243022852577269, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -3.276106834411621, "logits/rejected": -4.303354740142822, "logps/chosen": -445.64739990234375, "logps/ref_chosen": -36.52899932861328, "logps/ref_rejected": -85.29887390136719, "logps/rejected": -1090.350341796875, "loss": 0.9186, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5071742534637451, "rewards/margins": 0.7325094938278198, "rewards/rejected": -1.2396838665008545, "step": 560 }, { "epoch": 0.8237885462555066, "epsilon_dpo/beta": 0.0012296221684664488, "epsilon_dpo/beta_margin_grad_mean": -0.32985633611679077, "epsilon_dpo/beta_margin_grad_std": 0.16898517310619354, "epsilon_dpo/beta_margin_mean": 0.8398326635360718, "epsilon_dpo/beta_margin_std": 0.8977148532867432, "epsilon_dpo/loss_margin_mean": 686.9766845703125, "grad_norm": 51.50674057006836, "kl/avg_steps": 0.5625, "kl/beta": 0.001236454234458506, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.669493178106432e-08, "logits/chosen": -3.1779091358184814, "logits/rejected": -4.242127418518066, "logps/chosen": -401.35638427734375, "logps/ref_chosen": -33.39714050292969, "logps/ref_rejected": -104.86662292480469, "logps/rejected": -1159.802490234375, "loss": 0.8751, "rewards/accuracies": 0.875, "rewards/chosen": -0.45402687788009644, "rewards/margins": 0.8398326635360718, "rewards/rejected": -1.293859601020813, "step": 561 }, { "epoch": 0.8252569750367107, "epsilon_dpo/beta": 0.001219669939018786, "epsilon_dpo/beta_margin_grad_mean": -0.31512200832366943, "epsilon_dpo/beta_margin_grad_std": 0.15188972651958466, "epsilon_dpo/beta_margin_mean": 0.9067568182945251, "epsilon_dpo/beta_margin_std": 0.8777819275856018, "epsilon_dpo/loss_margin_mean": 745.5482788085938, "grad_norm": 58.34850311279297, "kl/avg_steps": 0.8125, "kl/beta": 0.001229538000188768, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -3.1661272048950195, "logits/rejected": -4.288407325744629, "logps/chosen": -410.8808898925781, "logps/ref_chosen": -37.05860900878906, "logps/ref_rejected": -104.74354553222656, "logps/rejected": -1224.1141357421875, "loss": 0.8169, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4571970999240875, "rewards/margins": 0.9067568182945251, "rewards/rejected": -1.363953948020935, "step": 562 }, { "epoch": 0.8267254038179148, "epsilon_dpo/beta": 0.0012125081848353148, "epsilon_dpo/beta_margin_grad_mean": -0.33571353554725647, "epsilon_dpo/beta_margin_grad_std": 0.15231700241565704, "epsilon_dpo/beta_margin_mean": 0.7808871269226074, "epsilon_dpo/beta_margin_std": 0.7981037497520447, "epsilon_dpo/loss_margin_mean": 647.4954223632812, "grad_norm": 56.34610366821289, "kl/avg_steps": 0.59375, "kl/beta": 0.0012196284951642156, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.521198892775202e-08, "logits/chosen": -3.220041275024414, "logits/rejected": -4.239965915679932, "logps/chosen": -462.99176025390625, "logps/ref_chosen": -44.557411193847656, "logps/ref_rejected": -102.11149597167969, "logps/rejected": -1168.041259765625, "loss": 0.8805, "rewards/accuracies": 0.875, "rewards/chosen": -0.5091303586959839, "rewards/margins": 0.7808871269226074, "rewards/rejected": -1.2900173664093018, "step": 563 }, { "epoch": 0.8281938325991189, "epsilon_dpo/beta": 0.0012034567771479487, "epsilon_dpo/beta_margin_grad_mean": -0.3051425516605377, "epsilon_dpo/beta_margin_grad_std": 0.15122495591640472, "epsilon_dpo/beta_margin_mean": 0.944663405418396, "epsilon_dpo/beta_margin_std": 0.820910632610321, "epsilon_dpo/loss_margin_mean": 787.502197265625, "grad_norm": 49.93864059448242, "kl/avg_steps": 0.75, "kl/beta": 0.0012124297209084034, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.447860229910544e-08, "logits/chosen": -3.3658523559570312, "logits/rejected": -4.48719596862793, "logps/chosen": -445.9974060058594, "logps/ref_chosen": -43.770145416259766, "logps/ref_rejected": -99.07379150390625, "logps/rejected": -1288.80322265625, "loss": 0.7846, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4852016270160675, "rewards/margins": 0.944663405418396, "rewards/rejected": -1.4298651218414307, "step": 564 }, { "epoch": 0.8296622613803231, "epsilon_dpo/beta": 0.001195436459966004, "epsilon_dpo/beta_margin_grad_mean": -0.3250107765197754, "epsilon_dpo/beta_margin_grad_std": 0.1642923802137375, "epsilon_dpo/beta_margin_mean": 0.8570442795753479, "epsilon_dpo/beta_margin_std": 0.9114473462104797, "epsilon_dpo/loss_margin_mean": 720.2861328125, "grad_norm": 44.75363540649414, "kl/avg_steps": 0.671875, "kl/beta": 0.001203404157422483, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.375063135042445e-08, "logits/chosen": -3.1451213359832764, "logits/rejected": -4.4003705978393555, "logps/chosen": -405.4493103027344, "logps/ref_chosen": -35.76008605957031, "logps/ref_rejected": -92.45127868652344, "logps/rejected": -1182.4266357421875, "loss": 0.8657, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4435133934020996, "rewards/margins": 0.8570442199707031, "rewards/rejected": -1.3005576133728027, "step": 565 }, { "epoch": 0.8311306901615272, "epsilon_dpo/beta": 0.0011870847083628178, "epsilon_dpo/beta_margin_grad_mean": -0.3126325309276581, "epsilon_dpo/beta_margin_grad_std": 0.1551150530576706, "epsilon_dpo/beta_margin_mean": 0.9413906335830688, "epsilon_dpo/beta_margin_std": 0.8944268226623535, "epsilon_dpo/loss_margin_mean": 795.7219848632812, "grad_norm": 53.6401481628418, "kl/avg_steps": 0.703125, "kl/beta": 0.0011953727807849646, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -3.165487289428711, "logits/rejected": -4.408507347106934, "logps/chosen": -425.1823425292969, "logps/ref_chosen": -44.17033004760742, "logps/ref_rejected": -111.02940368652344, "logps/rejected": -1287.763427734375, "loss": 0.8029, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4532804787158966, "rewards/margins": 0.9413906335830688, "rewards/rejected": -1.394671082496643, "step": 566 }, { "epoch": 0.8325991189427313, "epsilon_dpo/beta": 0.001178241684101522, "epsilon_dpo/beta_margin_grad_mean": -0.32961148023605347, "epsilon_dpo/beta_margin_grad_std": 0.1618032306432724, "epsilon_dpo/beta_margin_mean": 0.8316484093666077, "epsilon_dpo/beta_margin_std": 0.862678050994873, "epsilon_dpo/loss_margin_mean": 708.5885009765625, "grad_norm": 55.0069694519043, "kl/avg_steps": 0.75, "kl/beta": 0.0011870265007019043, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.231101308059165e-08, "logits/chosen": -3.2106685638427734, "logits/rejected": -4.49782657623291, "logps/chosen": -419.98046875, "logps/ref_chosen": -34.52465057373047, "logps/ref_rejected": -92.00111389160156, "logps/rejected": -1186.04541015625, "loss": 0.8671, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4556201100349426, "rewards/margins": 0.8316484093666077, "rewards/rejected": -1.2872684001922607, "step": 567 }, { "epoch": 0.8340675477239354, "epsilon_dpo/beta": 0.0011691023828461766, "epsilon_dpo/beta_margin_grad_mean": -0.3085906505584717, "epsilon_dpo/beta_margin_grad_std": 0.14849402010440826, "epsilon_dpo/beta_margin_mean": 0.9396561980247498, "epsilon_dpo/beta_margin_std": 0.8285186886787415, "epsilon_dpo/loss_margin_mean": 805.8914794921875, "grad_norm": 51.46321487426758, "kl/avg_steps": 0.78125, "kl/beta": 0.0011781901121139526, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -3.223677158355713, "logits/rejected": -4.477365493774414, "logps/chosen": -342.3359375, "logps/ref_chosen": -20.8466854095459, "logps/ref_rejected": -96.66067504882812, "logps/rejected": -1224.04150390625, "loss": 0.7867, "rewards/accuracies": 0.90625, "rewards/chosen": -0.37704116106033325, "rewards/margins": 0.9396562576293945, "rewards/rejected": -1.316697359085083, "step": 568 }, { "epoch": 0.8355359765051396, "epsilon_dpo/beta": 0.0011607703054323792, "epsilon_dpo/beta_margin_grad_mean": -0.3284705877304077, "epsilon_dpo/beta_margin_grad_std": 0.159523144364357, "epsilon_dpo/beta_margin_mean": 0.8572857975959778, "epsilon_dpo/beta_margin_std": 0.9337847232818604, "epsilon_dpo/loss_margin_mean": 741.6339111328125, "grad_norm": 49.56915283203125, "kl/avg_steps": 0.71875, "kl/beta": 0.0011690568644553423, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.089328585837512e-08, "logits/chosen": -3.329655647277832, "logits/rejected": -4.526782035827637, "logps/chosen": -403.1963195800781, "logps/ref_chosen": -44.985755920410156, "logps/ref_rejected": -84.71590423583984, "logps/rejected": -1184.5604248046875, "loss": 0.8626, "rewards/accuracies": 0.859375, "rewards/chosen": -0.41802793741226196, "rewards/margins": 0.857285737991333, "rewards/rejected": -1.2753137350082397, "step": 569 }, { "epoch": 0.8370044052863436, "epsilon_dpo/beta": 0.0011524868896231055, "epsilon_dpo/beta_margin_grad_mean": -0.3267635405063629, "epsilon_dpo/beta_margin_grad_std": 0.1530372053384781, "epsilon_dpo/beta_margin_mean": 0.8452085256576538, "epsilon_dpo/beta_margin_std": 0.8515332341194153, "epsilon_dpo/loss_margin_mean": 736.2088012695312, "grad_norm": 46.32669448852539, "kl/avg_steps": 0.71875, "kl/beta": 0.0011607141932472587, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.019267817841834e-08, "logits/chosen": -3.3636648654937744, "logits/rejected": -4.605798721313477, "logps/chosen": -474.98382568359375, "logps/ref_chosen": -39.338470458984375, "logps/ref_rejected": -88.34091186523438, "logps/rejected": -1260.195068359375, "loss": 0.8506, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5037564635276794, "rewards/margins": 0.8452085256576538, "rewards/rejected": -1.3489649295806885, "step": 570 }, { "epoch": 0.8384728340675477, "epsilon_dpo/beta": 0.001143542118370533, "epsilon_dpo/beta_margin_grad_mean": -0.2934305667877197, "epsilon_dpo/beta_margin_grad_std": 0.16104663908481598, "epsilon_dpo/beta_margin_mean": 1.0465232133865356, "epsilon_dpo/beta_margin_std": 0.9296081066131592, "epsilon_dpo/loss_margin_mean": 917.7656860351562, "grad_norm": 43.56798553466797, "kl/avg_steps": 0.78125, "kl/beta": 0.0011524311266839504, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -3.367818832397461, "logits/rejected": -4.655285835266113, "logps/chosen": -398.0816650390625, "logps/ref_chosen": -27.42264747619629, "logps/ref_rejected": -99.594482421875, "logps/rejected": -1388.0191650390625, "loss": 0.7563, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4253048002719879, "rewards/margins": 1.0465232133865356, "rewards/rejected": -1.4718279838562012, "step": 571 }, { "epoch": 0.8399412628487518, "epsilon_dpo/beta": 0.001137536484748125, "epsilon_dpo/beta_margin_grad_mean": -0.32172301411628723, "epsilon_dpo/beta_margin_grad_std": 0.18587327003479004, "epsilon_dpo/beta_margin_mean": 0.8928326964378357, "epsilon_dpo/beta_margin_std": 0.9588112831115723, "epsilon_dpo/loss_margin_mean": 790.0717163085938, "grad_norm": 59.63137435913086, "kl/avg_steps": 0.53125, "kl/beta": 0.0011434975313022733, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.880806698864086e-08, "logits/chosen": -3.412736654281616, "logits/rejected": -4.5799455642700195, "logps/chosen": -437.66815185546875, "logps/ref_chosen": -28.469844818115234, "logps/ref_rejected": -90.8378677368164, "logps/rejected": -1290.10791015625, "loss": 0.8689, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4676932096481323, "rewards/margins": 0.8928326368331909, "rewards/rejected": -1.3605258464813232, "step": 572 }, { "epoch": 0.8414096916299559, "epsilon_dpo/beta": 0.00113116973079741, "epsilon_dpo/beta_margin_grad_mean": -0.3352082371711731, "epsilon_dpo/beta_margin_grad_std": 0.1706962138414383, "epsilon_dpo/beta_margin_mean": 0.8242603540420532, "epsilon_dpo/beta_margin_std": 0.914152204990387, "epsilon_dpo/loss_margin_mean": 732.9346923828125, "grad_norm": 58.05121612548828, "kl/avg_steps": 0.5625, "kl/beta": 0.0011374547611922026, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.812409996461275e-08, "logits/chosen": -3.55238676071167, "logits/rejected": -4.599684715270996, "logps/chosen": -550.3379516601562, "logps/ref_chosen": -35.605384826660156, "logps/ref_rejected": -91.72334289550781, "logps/rejected": -1339.390625, "loss": 0.8899, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5845237970352173, "rewards/margins": 0.8242603540420532, "rewards/rejected": -1.4087841510772705, "step": 573 }, { "epoch": 0.8428781204111601, "epsilon_dpo/beta": 0.0011223680339753628, "epsilon_dpo/beta_margin_grad_mean": -0.2976383864879608, "epsilon_dpo/beta_margin_grad_std": 0.15246038138866425, "epsilon_dpo/beta_margin_mean": 1.003798246383667, "epsilon_dpo/beta_margin_std": 0.8582514524459839, "epsilon_dpo/loss_margin_mean": 896.7732543945312, "grad_norm": 47.21782302856445, "kl/avg_steps": 0.78125, "kl/beta": 0.001131092431023717, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.74457160675965e-08, "logits/chosen": -3.418008327484131, "logits/rejected": -4.645315170288086, "logps/chosen": -419.362060546875, "logps/ref_chosen": -31.43255615234375, "logps/ref_rejected": -98.70758056640625, "logps/rejected": -1383.410400390625, "loss": 0.7582, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4360452890396118, "rewards/margins": 1.003798246383667, "rewards/rejected": -1.4398435354232788, "step": 574 }, { "epoch": 0.8443465491923642, "epsilon_dpo/beta": 0.0011150705395266414, "epsilon_dpo/beta_margin_grad_mean": -0.3144918978214264, "epsilon_dpo/beta_margin_grad_std": 0.16865824162960052, "epsilon_dpo/beta_margin_mean": 0.9695743918418884, "epsilon_dpo/beta_margin_std": 1.0260599851608276, "epsilon_dpo/loss_margin_mean": 873.2989501953125, "grad_norm": 52.89796447753906, "kl/avg_steps": 0.65625, "kl/beta": 0.0011223242618143559, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.677293317363864e-08, "logits/chosen": -3.360232353210449, "logits/rejected": -4.744388580322266, "logps/chosen": -476.3949279785156, "logps/ref_chosen": -40.592437744140625, "logps/ref_rejected": -102.49923706054688, "logps/rejected": -1411.6007080078125, "loss": 0.8223, "rewards/accuracies": 0.875, "rewards/chosen": -0.4872722625732422, "rewards/margins": 0.9695743322372437, "rewards/rejected": -1.4568467140197754, "step": 575 }, { "epoch": 0.8458149779735683, "epsilon_dpo/beta": 0.001109194359742105, "epsilon_dpo/beta_margin_grad_mean": -0.3279215395450592, "epsilon_dpo/beta_margin_grad_std": 0.1897161304950714, "epsilon_dpo/beta_margin_mean": 0.9002985954284668, "epsilon_dpo/beta_margin_std": 1.0915855169296265, "epsilon_dpo/loss_margin_mean": 817.2003784179688, "grad_norm": 49.19211196899414, "kl/avg_steps": 0.53125, "kl/beta": 0.0011150069767609239, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -3.442026376724243, "logits/rejected": -4.685531139373779, "logps/chosen": -509.7559814453125, "logps/ref_chosen": -34.52962875366211, "logps/ref_rejected": -107.82730102539062, "logps/rejected": -1400.2540283203125, "loss": 0.8987, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5297691822052002, "rewards/margins": 0.9002985954284668, "rewards/rejected": -1.430067777633667, "step": 576 }, { "epoch": 0.8472834067547724, "epsilon_dpo/beta": 0.0011009065201506019, "epsilon_dpo/beta_margin_grad_mean": -0.3192014694213867, "epsilon_dpo/beta_margin_grad_std": 0.13799449801445007, "epsilon_dpo/beta_margin_mean": 0.8508294224739075, "epsilon_dpo/beta_margin_std": 0.7307493686676025, "epsilon_dpo/loss_margin_mean": 775.322509765625, "grad_norm": 47.395843505859375, "kl/avg_steps": 0.75, "kl/beta": 0.0011091148480772972, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -3.390251636505127, "logits/rejected": -4.721263885498047, "logps/chosen": -472.676513671875, "logps/ref_chosen": -36.379669189453125, "logps/ref_rejected": -91.19670104980469, "logps/rejected": -1302.816162109375, "loss": 0.8153, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4818669557571411, "rewards/margins": 0.8508294224739075, "rewards/rejected": -1.3326964378356934, "step": 577 }, { "epoch": 0.8487518355359766, "epsilon_dpo/beta": 0.0010930553544312716, "epsilon_dpo/beta_margin_grad_mean": -0.31814831495285034, "epsilon_dpo/beta_margin_grad_std": 0.1622655838727951, "epsilon_dpo/beta_margin_mean": 0.8751156330108643, "epsilon_dpo/beta_margin_std": 0.8696184754371643, "epsilon_dpo/loss_margin_mean": 803.7972412109375, "grad_norm": 56.689117431640625, "kl/avg_steps": 0.71875, "kl/beta": 0.0011008584406226873, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.478836705390808e-08, "logits/chosen": -3.35703182220459, "logits/rejected": -4.623649597167969, "logps/chosen": -402.2279357910156, "logps/ref_chosen": -30.31075668334961, "logps/ref_rejected": -89.46212005615234, "logps/rejected": -1265.176513671875, "loss": 0.8456, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4083082675933838, "rewards/margins": 0.8751156330108643, "rewards/rejected": -1.283423900604248, "step": 578 }, { "epoch": 0.8502202643171806, "epsilon_dpo/beta": 0.0010845718206837773, "epsilon_dpo/beta_margin_grad_mean": -0.3247964680194855, "epsilon_dpo/beta_margin_grad_std": 0.13387812674045563, "epsilon_dpo/beta_margin_mean": 0.8189929723739624, "epsilon_dpo/beta_margin_std": 0.7138299345970154, "epsilon_dpo/loss_margin_mean": 757.1980590820312, "grad_norm": 47.10636520385742, "kl/avg_steps": 0.78125, "kl/beta": 0.0010930025018751621, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.41381639738331e-08, "logits/chosen": -3.4386825561523438, "logits/rejected": -4.622516632080078, "logps/chosen": -481.96820068359375, "logps/ref_chosen": -38.872169494628906, "logps/ref_rejected": -102.441162109375, "logps/rejected": -1302.7352294921875, "loss": 0.8296, "rewards/accuracies": 0.9375, "rewards/chosen": -0.48166942596435547, "rewards/margins": 0.8189929723739624, "rewards/rejected": -1.3006623983383179, "step": 579 }, { "epoch": 0.8516886930983847, "epsilon_dpo/beta": 0.0010795537382364273, "epsilon_dpo/beta_margin_grad_mean": -0.3553590476512909, "epsilon_dpo/beta_margin_grad_std": 0.1680212914943695, "epsilon_dpo/beta_margin_mean": 0.7070707082748413, "epsilon_dpo/beta_margin_std": 0.8788829445838928, "epsilon_dpo/loss_margin_mean": 659.7229614257812, "grad_norm": 53.16489028930664, "kl/avg_steps": 0.46875, "kl/beta": 0.0010845295619219542, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.349364905389032e-08, "logits/chosen": -3.4820971488952637, "logits/rejected": -4.580034255981445, "logps/chosen": -516.2334594726562, "logps/ref_chosen": -43.879005432128906, "logps/ref_rejected": -87.93156433105469, "logps/rejected": -1220.0089111328125, "loss": 0.9567, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5118281841278076, "rewards/margins": 0.7070707082748413, "rewards/rejected": -1.2188990116119385, "step": 580 }, { "epoch": 0.8531571218795888, "epsilon_dpo/beta": 0.0010721554281190038, "epsilon_dpo/beta_margin_grad_mean": -0.32997187972068787, "epsilon_dpo/beta_margin_grad_std": 0.16221848130226135, "epsilon_dpo/beta_margin_mean": 0.8481745719909668, "epsilon_dpo/beta_margin_std": 0.9297420382499695, "epsilon_dpo/loss_margin_mean": 794.6920166015625, "grad_norm": 48.09032440185547, "kl/avg_steps": 0.6875, "kl/beta": 0.0010794695699587464, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.285483927764726e-08, "logits/chosen": -3.527688503265381, "logits/rejected": -4.8321614265441895, "logps/chosen": -542.8037109375, "logps/ref_chosen": -43.27698516845703, "logps/ref_rejected": -97.41041564941406, "logps/rejected": -1391.629150390625, "loss": 0.8701, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5371580123901367, "rewards/margins": 0.8481745719909668, "rewards/rejected": -1.3853325843811035, "step": 581 }, { "epoch": 0.8546255506607929, "epsilon_dpo/beta": 0.001064834650605917, "epsilon_dpo/beta_margin_grad_mean": -0.32155805826187134, "epsilon_dpo/beta_margin_grad_std": 0.18931035697460175, "epsilon_dpo/beta_margin_mean": 0.9292847514152527, "epsilon_dpo/beta_margin_std": 1.0593138933181763, "epsilon_dpo/loss_margin_mean": 877.0955200195312, "grad_norm": 56.42830276489258, "kl/avg_steps": 0.6875, "kl/beta": 0.0010720988502725959, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.222175147833556e-08, "logits/chosen": -3.625027656555176, "logits/rejected": -4.7767333984375, "logps/chosen": -586.5724487304688, "logps/ref_chosen": -47.183502197265625, "logps/ref_rejected": -118.7525634765625, "logps/rejected": -1535.237060546875, "loss": 0.874, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5769472718238831, "rewards/margins": 0.9292848110198975, "rewards/rejected": -1.5062320232391357, "step": 582 }, { "epoch": 0.856093979441997, "epsilon_dpo/beta": 0.0010598934022709727, "epsilon_dpo/beta_margin_grad_mean": -0.3548807203769684, "epsilon_dpo/beta_margin_grad_std": 0.19512853026390076, "epsilon_dpo/beta_margin_mean": 0.7214035391807556, "epsilon_dpo/beta_margin_std": 1.0699379444122314, "epsilon_dpo/loss_margin_mean": 686.6939697265625, "grad_norm": 76.16841888427734, "kl/avg_steps": 0.46875, "kl/beta": 0.0010647785384207964, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.159440233840763e-08, "logits/chosen": -3.5470635890960693, "logits/rejected": -4.620864391326904, "logps/chosen": -522.0079956054688, "logps/ref_chosen": -37.24636459350586, "logps/ref_rejected": -95.97724151611328, "logps/rejected": -1267.432861328125, "loss": 1.018, "rewards/accuracies": 0.75, "rewards/chosen": -0.5164851546287537, "rewards/margins": 0.7214035987854004, "rewards/rejected": -1.2378886938095093, "step": 583 }, { "epoch": 0.8575624082232012, "epsilon_dpo/beta": 0.0010506423423066735, "epsilon_dpo/beta_margin_grad_mean": -0.2919199466705322, "epsilon_dpo/beta_margin_grad_std": 0.1186746209859848, "epsilon_dpo/beta_margin_mean": 0.9708749651908875, "epsilon_dpo/beta_margin_std": 0.6488715410232544, "epsilon_dpo/loss_margin_mean": 925.4342041015625, "grad_norm": 45.21849060058594, "kl/avg_steps": 0.875, "kl/beta": 0.0010598106309771538, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -3.285341262817383, "logits/rejected": -4.798908710479736, "logps/chosen": -381.122314453125, "logps/ref_chosen": -34.52136993408203, "logps/ref_rejected": -105.40646362304688, "logps/rejected": -1377.441650390625, "loss": 0.7209, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3644617795944214, "rewards/margins": 0.9708749055862427, "rewards/rejected": -1.335336685180664, "step": 584 }, { "epoch": 0.8590308370044053, "epsilon_dpo/beta": 0.001042842399328947, "epsilon_dpo/beta_margin_grad_mean": -0.3161095976829529, "epsilon_dpo/beta_margin_grad_std": 0.1634971797466278, "epsilon_dpo/beta_margin_mean": 0.9011527895927429, "epsilon_dpo/beta_margin_std": 0.8913306593894958, "epsilon_dpo/loss_margin_mean": 867.5726928710938, "grad_norm": 49.96684265136719, "kl/avg_steps": 0.75, "kl/beta": 0.001050617778673768, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.035698600998121e-08, "logits/chosen": -3.437217950820923, "logits/rejected": -4.798175811767578, "logps/chosen": -450.2691345214844, "logps/ref_chosen": -36.641475677490234, "logps/ref_rejected": -92.18727111816406, "logps/rejected": -1373.3876953125, "loss": 0.8314, "rewards/accuracies": 0.875, "rewards/chosen": -0.4328582286834717, "rewards/margins": 0.9011527895927429, "rewards/rejected": -1.3340110778808594, "step": 585 }, { "epoch": 0.8604992657856094, "epsilon_dpo/beta": 0.0010367088252678514, "epsilon_dpo/beta_margin_grad_mean": -0.3300473392009735, "epsilon_dpo/beta_margin_grad_std": 0.16417358815670013, "epsilon_dpo/beta_margin_mean": 0.833533763885498, "epsilon_dpo/beta_margin_std": 0.8716885447502136, "epsilon_dpo/loss_margin_mean": 808.5335693359375, "grad_norm": 41.283748626708984, "kl/avg_steps": 0.59375, "kl/beta": 0.001042796764522791, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.974695142855388e-08, "logits/chosen": -3.5555217266082764, "logits/rejected": -4.804073333740234, "logps/chosen": -526.1253051757812, "logps/ref_chosen": -36.78833770751953, "logps/ref_rejected": -98.72267150878906, "logps/rejected": -1396.59326171875, "loss": 0.8691, "rewards/accuracies": 0.8125, "rewards/chosen": -0.508251428604126, "rewards/margins": 0.833533763885498, "rewards/rejected": -1.341785192489624, "step": 586 }, { "epoch": 0.8619676945668135, "epsilon_dpo/beta": 0.0010302658192813396, "epsilon_dpo/beta_margin_grad_mean": -0.33965250849723816, "epsilon_dpo/beta_margin_grad_std": 0.15424403548240662, "epsilon_dpo/beta_margin_mean": 0.7954989075660706, "epsilon_dpo/beta_margin_std": 0.8702175617218018, "epsilon_dpo/loss_margin_mean": 775.6798706054688, "grad_norm": 43.92211151123047, "kl/avg_steps": 0.625, "kl/beta": 0.001036641770042479, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -3.2840447425842285, "logits/rejected": -4.652719020843506, "logps/chosen": -320.67523193359375, "logps/ref_chosen": -27.06256103515625, "logps/ref_rejected": -87.31655883789062, "logps/rejected": -1156.609130859375, "loss": 0.8854, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30333322286605835, "rewards/margins": 0.7954989075660706, "rewards/rejected": -1.098832130432129, "step": 587 }, { "epoch": 0.8634361233480177, "epsilon_dpo/beta": 0.0010235445806756616, "epsilon_dpo/beta_margin_grad_mean": -0.3219730854034424, "epsilon_dpo/beta_margin_grad_std": 0.1715155392885208, "epsilon_dpo/beta_margin_mean": 0.8723087906837463, "epsilon_dpo/beta_margin_std": 0.9220552444458008, "epsilon_dpo/loss_margin_mean": 856.6797485351562, "grad_norm": 54.41194152832031, "kl/avg_steps": 0.65625, "kl/beta": 0.001030202955007553, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -3.385420083999634, "logits/rejected": -4.716907978057861, "logps/chosen": -473.547607421875, "logps/ref_chosen": -32.258827209472656, "logps/ref_rejected": -112.76737976074219, "logps/rejected": -1410.73583984375, "loss": 0.8623, "rewards/accuracies": 0.859375, "rewards/chosen": -0.45345091819763184, "rewards/margins": 0.8723088502883911, "rewards/rejected": -1.3257596492767334, "step": 588 }, { "epoch": 0.8649045521292217, "epsilon_dpo/beta": 0.0010178310330957174, "epsilon_dpo/beta_margin_grad_mean": -0.3191934823989868, "epsilon_dpo/beta_margin_grad_std": 0.17622847855091095, "epsilon_dpo/beta_margin_mean": 0.9419023990631104, "epsilon_dpo/beta_margin_std": 1.028359293937683, "epsilon_dpo/loss_margin_mean": 930.8062744140625, "grad_norm": 57.298946380615234, "kl/avg_steps": 0.5625, "kl/beta": 0.001023486373014748, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -3.4220762252807617, "logits/rejected": -4.901893138885498, "logps/chosen": -488.34820556640625, "logps/ref_chosen": -36.310951232910156, "logps/ref_rejected": -119.32230377197266, "logps/rejected": -1502.165771484375, "loss": 0.846, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46269693970680237, "rewards/margins": 0.9419023990631104, "rewards/rejected": -1.4045993089675903, "step": 589 }, { "epoch": 0.8663729809104258, "epsilon_dpo/beta": 0.0010111834853887558, "epsilon_dpo/beta_margin_grad_mean": -0.3087705075740814, "epsilon_dpo/beta_margin_grad_std": 0.15020431578159332, "epsilon_dpo/beta_margin_mean": 0.94552081823349, "epsilon_dpo/beta_margin_std": 0.8651568293571472, "epsilon_dpo/loss_margin_mean": 938.7587280273438, "grad_norm": 43.128292083740234, "kl/avg_steps": 0.65625, "kl/beta": 0.001017761416733265, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.736501028272095e-08, "logits/chosen": -3.2722253799438477, "logits/rejected": -4.7439470291137695, "logps/chosen": -384.21209716796875, "logps/ref_chosen": -31.54564094543457, "logps/ref_rejected": -112.30560302734375, "logps/rejected": -1403.730712890625, "loss": 0.7904, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3574645221233368, "rewards/margins": 0.94552081823349, "rewards/rejected": -1.302985429763794, "step": 590 }, { "epoch": 0.8678414096916299, "epsilon_dpo/beta": 0.0010042748181149364, "epsilon_dpo/beta_margin_grad_mean": -0.3359038829803467, "epsilon_dpo/beta_margin_grad_std": 0.17261067032814026, "epsilon_dpo/beta_margin_mean": 0.8146455883979797, "epsilon_dpo/beta_margin_std": 0.9358741641044617, "epsilon_dpo/loss_margin_mean": 815.1807861328125, "grad_norm": 63.2326545715332, "kl/avg_steps": 0.6875, "kl/beta": 0.0010111258598044515, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.678415274939408e-08, "logits/chosen": -3.32479190826416, "logits/rejected": -4.7041168212890625, "logps/chosen": -425.36102294921875, "logps/ref_chosen": -32.81976318359375, "logps/ref_rejected": -89.97746276855469, "logps/rejected": -1297.699462890625, "loss": 0.9023, "rewards/accuracies": 0.859375, "rewards/chosen": -0.39648881554603577, "rewards/margins": 0.814645528793335, "rewards/rejected": -1.211134433746338, "step": 591 }, { "epoch": 0.869309838472834, "epsilon_dpo/beta": 0.0009980452014133334, "epsilon_dpo/beta_margin_grad_mean": -0.33510512113571167, "epsilon_dpo/beta_margin_grad_std": 0.15517839789390564, "epsilon_dpo/beta_margin_mean": 0.7904956340789795, "epsilon_dpo/beta_margin_std": 0.8030670285224915, "epsilon_dpo/loss_margin_mean": 796.0484008789062, "grad_norm": 49.01154708862305, "kl/avg_steps": 0.625, "kl/beta": 0.001004221849143505, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -3.4462766647338867, "logits/rejected": -4.8302812576293945, "logps/chosen": -423.3301696777344, "logps/ref_chosen": -31.739425659179688, "logps/ref_rejected": -81.3411865234375, "logps/rejected": -1268.98046875, "loss": 0.877, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3929889500141144, "rewards/margins": 0.7904956340789795, "rewards/rejected": -1.1834845542907715, "step": 592 }, { "epoch": 0.8707782672540382, "epsilon_dpo/beta": 0.0009934057015925646, "epsilon_dpo/beta_margin_grad_mean": -0.3691524267196655, "epsilon_dpo/beta_margin_grad_std": 0.15739652514457703, "epsilon_dpo/beta_margin_mean": 0.611284613609314, "epsilon_dpo/beta_margin_std": 0.7558280825614929, "epsilon_dpo/loss_margin_mean": 619.8391723632812, "grad_norm": 50.16069412231445, "kl/avg_steps": 0.46875, "kl/beta": 0.0009979844326153398, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.564009866938349e-08, "logits/chosen": -3.442495822906494, "logits/rejected": -4.646967887878418, "logps/chosen": -388.3526611328125, "logps/ref_chosen": -33.75498962402344, "logps/ref_rejected": -66.24615478515625, "logps/rejected": -1040.68310546875, "loss": 0.9905, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35422414541244507, "rewards/margins": 0.611284613609314, "rewards/rejected": -0.965508759021759, "step": 593 }, { "epoch": 0.8722466960352423, "epsilon_dpo/beta": 0.0009878396522253752, "epsilon_dpo/beta_margin_grad_mean": -0.3453194499015808, "epsilon_dpo/beta_margin_grad_std": 0.15376316010951996, "epsilon_dpo/beta_margin_mean": 0.7299271821975708, "epsilon_dpo/beta_margin_std": 0.7822921276092529, "epsilon_dpo/loss_margin_mean": 743.34716796875, "grad_norm": 56.66592025756836, "kl/avg_steps": 0.5625, "kl/beta": 0.0009933282854035497, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -3.383502960205078, "logits/rejected": -4.847668647766113, "logps/chosen": -447.0162353515625, "logps/ref_chosen": -34.56220245361328, "logps/ref_rejected": -92.779052734375, "logps/rejected": -1248.580322265625, "loss": 0.9111, "rewards/accuracies": 0.78125, "rewards/chosen": -0.40900394320487976, "rewards/margins": 0.7299271821975708, "rewards/rejected": -1.1389310359954834, "step": 594 }, { "epoch": 0.8737151248164464, "epsilon_dpo/beta": 0.0009810791816562414, "epsilon_dpo/beta_margin_grad_mean": -0.35142308473587036, "epsilon_dpo/beta_margin_grad_std": 0.14063841104507446, "epsilon_dpo/beta_margin_mean": 0.7219837307929993, "epsilon_dpo/beta_margin_std": 0.8028604388237, "epsilon_dpo/loss_margin_mean": 738.6736450195312, "grad_norm": 48.688880920410156, "kl/avg_steps": 0.6875, "kl/beta": 0.0009877720149233937, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.451969280180849e-08, "logits/chosen": -3.31217098236084, "logits/rejected": -4.6105427742004395, "logps/chosen": -390.6729431152344, "logps/ref_chosen": -31.321365356445312, "logps/ref_rejected": -85.45933532714844, "logps/rejected": -1183.484619140625, "loss": 0.9122, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3539902865886688, "rewards/margins": 0.7219836711883545, "rewards/rejected": -1.0759739875793457, "step": 595 }, { "epoch": 0.8751835535976505, "epsilon_dpo/beta": 0.0009740737732499838, "epsilon_dpo/beta_margin_grad_mean": -0.3495016098022461, "epsilon_dpo/beta_margin_grad_std": 0.13467830419540405, "epsilon_dpo/beta_margin_mean": 0.7045981287956238, "epsilon_dpo/beta_margin_std": 0.7173324227333069, "epsilon_dpo/loss_margin_mean": 726.050048828125, "grad_norm": 47.94287872314453, "kl/avg_steps": 0.71875, "kl/beta": 0.0009810274932533503, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.396839494982103e-08, "logits/chosen": -3.158041000366211, "logits/rejected": -4.6905059814453125, "logps/chosen": -378.280517578125, "logps/ref_chosen": -30.307376861572266, "logps/ref_rejected": -88.15638732910156, "logps/rejected": -1162.1795654296875, "loss": 0.9049, "rewards/accuracies": 0.875, "rewards/chosen": -0.3397039771080017, "rewards/margins": 0.7045981287956238, "rewards/rejected": -1.0443021059036255, "step": 596 }, { "epoch": 0.8766519823788547, "epsilon_dpo/beta": 0.0009677313501015306, "epsilon_dpo/beta_margin_grad_mean": -0.32157784700393677, "epsilon_dpo/beta_margin_grad_std": 0.17215141654014587, "epsilon_dpo/beta_margin_mean": 0.894204318523407, "epsilon_dpo/beta_margin_std": 0.9434449076652527, "epsilon_dpo/loss_margin_mean": 928.6572265625, "grad_norm": 41.356510162353516, "kl/avg_steps": 0.65625, "kl/beta": 0.0009740266250446439, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -3.414703845977783, "logits/rejected": -4.802955627441406, "logps/chosen": -437.45770263671875, "logps/ref_chosen": -34.63069152832031, "logps/ref_rejected": -107.39651489257812, "logps/rejected": -1438.8807373046875, "loss": 0.8543, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3919217586517334, "rewards/margins": 0.8942043781280518, "rewards/rejected": -1.2861260175704956, "step": 597 }, { "epoch": 0.8781204111600588, "epsilon_dpo/beta": 0.0009614220471121371, "epsilon_dpo/beta_margin_grad_mean": -0.34296542406082153, "epsilon_dpo/beta_margin_grad_std": 0.16591456532478333, "epsilon_dpo/beta_margin_mean": 0.7655530571937561, "epsilon_dpo/beta_margin_std": 0.8700756430625916, "epsilon_dpo/loss_margin_mean": 800.5443725585938, "grad_norm": 63.38301467895508, "kl/avg_steps": 0.65625, "kl/beta": 0.0009676762856543064, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -3.349184989929199, "logits/rejected": -4.659879684448242, "logps/chosen": -458.43048095703125, "logps/ref_chosen": -38.996089935302734, "logps/ref_rejected": -97.60490417480469, "logps/rejected": -1317.583740234375, "loss": 0.9145, "rewards/accuracies": 0.84375, "rewards/chosen": -0.40632039308547974, "rewards/margins": 0.7655530571937561, "rewards/rejected": -1.1718735694885254, "step": 598 }, { "epoch": 0.8795888399412628, "epsilon_dpo/beta": 0.0009536515572108328, "epsilon_dpo/beta_margin_grad_mean": -0.34948039054870605, "epsilon_dpo/beta_margin_grad_std": 0.14211682975292206, "epsilon_dpo/beta_margin_mean": 0.6984655857086182, "epsilon_dpo/beta_margin_std": 0.7249807715415955, "epsilon_dpo/loss_margin_mean": 734.8526000976562, "grad_norm": 51.95116424560547, "kl/avg_steps": 0.8125, "kl/beta": 0.0009613672737032175, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -3.448948621749878, "logits/rejected": -4.653861045837402, "logps/chosen": -416.742431640625, "logps/ref_chosen": -33.18586730957031, "logps/ref_rejected": -88.57789611816406, "logps/rejected": -1206.987060546875, "loss": 0.915, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3669903874397278, "rewards/margins": 0.6984655857086182, "rewards/rejected": -1.0654559135437012, "step": 599 }, { "epoch": 0.8810572687224669, "epsilon_dpo/beta": 0.0009474557591602206, "epsilon_dpo/beta_margin_grad_mean": -0.34580788016319275, "epsilon_dpo/beta_margin_grad_std": 0.13170255720615387, "epsilon_dpo/beta_margin_mean": 0.7138612270355225, "epsilon_dpo/beta_margin_std": 0.6741172075271606, "epsilon_dpo/loss_margin_mean": 756.361083984375, "grad_norm": 49.26176834106445, "kl/avg_steps": 0.65625, "kl/beta": 0.000953619135543704, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -3.4161376953125, "logits/rejected": -4.741451263427734, "logps/chosen": -460.50970458984375, "logps/ref_chosen": -42.37248229980469, "logps/ref_rejected": -89.61239624023438, "logps/rejected": -1264.110595703125, "loss": 0.8895, "rewards/accuracies": 0.875, "rewards/chosen": -0.39791154861450195, "rewards/margins": 0.7138612270355225, "rewards/rejected": -1.1117727756500244, "step": 600 }, { "epoch": 0.8810572687224669, "eval_epsilon_dpo/beta": 0.0009440449066460133, "eval_epsilon_dpo/beta_margin_grad_mean": -0.40722984075546265, "eval_epsilon_dpo/beta_margin_grad_std": 0.1766805797815323, "eval_epsilon_dpo/beta_margin_mean": 0.44606366753578186, "eval_epsilon_dpo/beta_margin_std": 0.8790761828422546, "eval_epsilon_dpo/loss_margin_mean": 478.8302001953125, "eval_kl/n_epsilon_steps": 0.31763699650764465, "eval_kl/p_epsilon_steps": 0.6819349527359009, "eval_logits/chosen": -3.8786203861236572, "eval_logits/rejected": -4.723351955413818, "eval_logps/chosen": -771.9093627929688, "eval_logps/ref_chosen": -68.29110717773438, "eval_logps/ref_rejected": -92.08038330078125, "eval_logps/rejected": -1274.5286865234375, "eval_loss": 0.5823308825492859, "eval_rewards/accuracies": 0.7037671208381653, "eval_rewards/chosen": -0.6674680709838867, "eval_rewards/margins": 0.44606366753578186, "eval_rewards/rejected": -1.1135317087173462, "eval_runtime": 38.382, "eval_samples_per_second": 60.94, "eval_steps_per_second": 1.928, "step": 600 }, { "epoch": 0.882525697503671, "epsilon_dpo/beta": 0.0009403903386555612, "epsilon_dpo/beta_margin_grad_mean": -0.32094281911849976, "epsilon_dpo/beta_margin_grad_std": 0.16856704652309418, "epsilon_dpo/beta_margin_mean": 0.8789842128753662, "epsilon_dpo/beta_margin_std": 0.901688814163208, "epsilon_dpo/loss_margin_mean": 938.4696044921875, "grad_norm": 48.359046936035156, "kl/avg_steps": 0.75, "kl/beta": 0.0009474018006585538, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -3.238035202026367, "logits/rejected": -4.882007122039795, "logps/chosen": -406.67852783203125, "logps/ref_chosen": -34.16810607910156, "logps/ref_rejected": -99.91683959960938, "logps/rejected": -1410.8968505859375, "loss": 0.852, "rewards/accuracies": 0.875, "rewards/chosen": -0.35252028703689575, "rewards/margins": 0.8789842128753662, "rewards/rejected": -1.2315044403076172, "step": 601 }, { "epoch": 0.8839941262848752, "epsilon_dpo/beta": 0.0009328021551482379, "epsilon_dpo/beta_margin_grad_mean": -0.3170680105686188, "epsilon_dpo/beta_margin_grad_std": 0.13237425684928894, "epsilon_dpo/beta_margin_mean": 0.8696146011352539, "epsilon_dpo/beta_margin_std": 0.728697657585144, "epsilon_dpo/loss_margin_mean": 934.039306640625, "grad_norm": 48.63134765625, "kl/avg_steps": 0.8125, "kl/beta": 0.0009403491858392954, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -3.2103443145751953, "logits/rejected": -4.763635635375977, "logps/chosen": -303.0639343261719, "logps/ref_chosen": -34.405120849609375, "logps/ref_rejected": -93.47988891601562, "logps/rejected": -1296.177978515625, "loss": 0.8003, "rewards/accuracies": 0.953125, "rewards/chosen": -0.25088733434677124, "rewards/margins": 0.8696146011352539, "rewards/rejected": -1.12050199508667, "step": 602 }, { "epoch": 0.8854625550660793, "epsilon_dpo/beta": 0.0009261587401852012, "epsilon_dpo/beta_margin_grad_mean": -0.3474172055721283, "epsilon_dpo/beta_margin_grad_std": 0.14739488065242767, "epsilon_dpo/beta_margin_mean": 0.7221988439559937, "epsilon_dpo/beta_margin_std": 0.7671189904212952, "epsilon_dpo/loss_margin_mean": 782.9607543945312, "grad_norm": 51.643531799316406, "kl/avg_steps": 0.71875, "kl/beta": 0.0009327704319730401, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -3.2944273948669434, "logits/rejected": -4.612957000732422, "logps/chosen": -470.8554992675781, "logps/ref_chosen": -53.07399368286133, "logps/ref_rejected": -107.52302551269531, "logps/rejected": -1308.265380859375, "loss": 0.9099, "rewards/accuracies": 0.890625, "rewards/chosen": -0.38833025097846985, "rewards/margins": 0.7221988439559937, "rewards/rejected": -1.1105290651321411, "step": 603 }, { "epoch": 0.8869309838472834, "epsilon_dpo/beta": 0.0009189706179313362, "epsilon_dpo/beta_margin_grad_mean": -0.3316219747066498, "epsilon_dpo/beta_margin_grad_std": 0.14688213169574738, "epsilon_dpo/beta_margin_mean": 0.8516996502876282, "epsilon_dpo/beta_margin_std": 0.9336607456207275, "epsilon_dpo/loss_margin_mean": 929.5543823242188, "grad_norm": 44.274085998535156, "kl/avg_steps": 0.78125, "kl/beta": 0.0009261139784939587, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.977362051376158e-08, "logits/chosen": -3.4287848472595215, "logits/rejected": -4.8621602058410645, "logps/chosen": -422.5206298828125, "logps/ref_chosen": -32.21878433227539, "logps/ref_rejected": -99.47515106201172, "logps/rejected": -1419.331298828125, "loss": 0.8561, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3594130277633667, "rewards/margins": 0.8516995906829834, "rewards/rejected": -1.2111127376556396, "step": 604 }, { "epoch": 0.8883994126284875, "epsilon_dpo/beta": 0.0009129956015385687, "epsilon_dpo/beta_margin_grad_mean": -0.33933547139167786, "epsilon_dpo/beta_margin_grad_std": 0.15050913393497467, "epsilon_dpo/beta_margin_mean": 0.7814961075782776, "epsilon_dpo/beta_margin_std": 0.8144083619117737, "epsilon_dpo/loss_margin_mean": 859.486328125, "grad_norm": 41.927146911621094, "kl/avg_steps": 0.65625, "kl/beta": 0.0009189348202198744, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -3.3077027797698975, "logits/rejected": -4.713131904602051, "logps/chosen": -402.01544189453125, "logps/ref_chosen": -42.914276123046875, "logps/ref_rejected": -108.40269470214844, "logps/rejected": -1326.990234375, "loss": 0.882, "rewards/accuracies": 0.875, "rewards/chosen": -0.3291019797325134, "rewards/margins": 0.7814960479736328, "rewards/rejected": -1.110598087310791, "step": 605 }, { "epoch": 0.8898678414096917, "epsilon_dpo/beta": 0.0009078990551643074, "epsilon_dpo/beta_margin_grad_mean": -0.3660072982311249, "epsilon_dpo/beta_margin_grad_std": 0.15605977177619934, "epsilon_dpo/beta_margin_mean": 0.618130087852478, "epsilon_dpo/beta_margin_std": 0.7797470688819885, "epsilon_dpo/loss_margin_mean": 685.6812133789062, "grad_norm": 57.995399475097656, "kl/avg_steps": 0.5625, "kl/beta": 0.000912943622097373, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -3.486124277114868, "logits/rejected": -4.584151268005371, "logps/chosen": -506.50384521484375, "logps/ref_chosen": -37.19722366333008, "logps/ref_rejected": -102.87519836425781, "logps/rejected": -1257.863037109375, "loss": 0.9904, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4276297986507416, "rewards/margins": 0.618130087852478, "rewards/rejected": -1.045759916305542, "step": 606 }, { "epoch": 0.8913362701908958, "epsilon_dpo/beta": 0.000903104490134865, "epsilon_dpo/beta_margin_grad_mean": -0.36472412943840027, "epsilon_dpo/beta_margin_grad_std": 0.16157901287078857, "epsilon_dpo/beta_margin_mean": 0.6311452984809875, "epsilon_dpo/beta_margin_std": 0.7819616198539734, "epsilon_dpo/loss_margin_mean": 704.2265014648438, "grad_norm": 55.2100830078125, "kl/avg_steps": 0.53125, "kl/beta": 0.0009078370640054345, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -3.4703445434570312, "logits/rejected": -4.856854438781738, "logps/chosen": -499.9947509765625, "logps/ref_chosen": -43.06529235839844, "logps/ref_rejected": -84.84536743164062, "logps/rejected": -1246.0013427734375, "loss": 0.9843, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4154084026813507, "rewards/margins": 0.6311452984809875, "rewards/rejected": -1.046553611755371, "step": 607 }, { "epoch": 0.8928046989720999, "epsilon_dpo/beta": 0.0008960742270573974, "epsilon_dpo/beta_margin_grad_mean": -0.3294691741466522, "epsilon_dpo/beta_margin_grad_std": 0.13619713485240936, "epsilon_dpo/beta_margin_mean": 0.8008559942245483, "epsilon_dpo/beta_margin_std": 0.7176367044448853, "epsilon_dpo/loss_margin_mean": 896.19287109375, "grad_norm": 60.484779357910156, "kl/avg_steps": 0.78125, "kl/beta": 0.0009030396468006074, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -3.1860623359680176, "logits/rejected": -4.672609806060791, "logps/chosen": -324.84686279296875, "logps/ref_chosen": -27.870777130126953, "logps/ref_rejected": -101.65553283691406, "logps/rejected": -1294.824462890625, "loss": 0.843, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2668021619319916, "rewards/margins": 0.8008559942245483, "rewards/rejected": -1.0676581859588623, "step": 608 }, { "epoch": 0.8942731277533039, "epsilon_dpo/beta": 0.0008896880317479372, "epsilon_dpo/beta_margin_grad_mean": -0.3255755305290222, "epsilon_dpo/beta_margin_grad_std": 0.13147076964378357, "epsilon_dpo/beta_margin_mean": 0.8107938766479492, "epsilon_dpo/beta_margin_std": 0.689571738243103, "epsilon_dpo/loss_margin_mean": 914.2926025390625, "grad_norm": 52.808658599853516, "kl/avg_steps": 0.71875, "kl/beta": 0.0008960393606685102, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -3.266610860824585, "logits/rejected": -4.786360740661621, "logps/chosen": -432.412841796875, "logps/ref_chosen": -33.51665496826172, "logps/ref_rejected": -96.93180084228516, "logps/rejected": -1410.12060546875, "loss": 0.829, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3561561107635498, "rewards/margins": 0.8107938766479492, "rewards/rejected": -1.166949987411499, "step": 609 }, { "epoch": 0.895741556534508, "epsilon_dpo/beta": 0.0008841731469146907, "epsilon_dpo/beta_margin_grad_mean": -0.34196192026138306, "epsilon_dpo/beta_margin_grad_std": 0.17104457318782806, "epsilon_dpo/beta_margin_mean": 0.7602141499519348, "epsilon_dpo/beta_margin_std": 0.892071008682251, "epsilon_dpo/loss_margin_mean": 865.0516967773438, "grad_norm": 50.597049713134766, "kl/avg_steps": 0.625, "kl/beta": 0.0008896450162865222, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -3.319277286529541, "logits/rejected": -4.948149681091309, "logps/chosen": -423.406982421875, "logps/ref_chosen": -39.733856201171875, "logps/ref_rejected": -88.57766723632812, "logps/rejected": -1337.302490234375, "loss": 0.9283, "rewards/accuracies": 0.84375, "rewards/chosen": -0.34177446365356445, "rewards/margins": 0.7602142095565796, "rewards/rejected": -1.1019885540008545, "step": 610 }, { "epoch": 0.8972099853157122, "epsilon_dpo/beta": 0.0008777128532528877, "epsilon_dpo/beta_margin_grad_mean": -0.3441890478134155, "epsilon_dpo/beta_margin_grad_std": 0.1642007678747177, "epsilon_dpo/beta_margin_mean": 0.7517509460449219, "epsilon_dpo/beta_margin_std": 0.8516819477081299, "epsilon_dpo/loss_margin_mean": 860.3085327148438, "grad_norm": 54.84195327758789, "kl/avg_steps": 0.734375, "kl/beta": 0.0008841192466206849, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.6421423736208e-08, "logits/chosen": -3.4497299194335938, "logits/rejected": -4.820193290710449, "logps/chosen": -460.02740478515625, "logps/ref_chosen": -34.78019332885742, "logps/ref_rejected": -90.61834716796875, "logps/rejected": -1376.174072265625, "loss": 0.9188, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3749154508113861, "rewards/margins": 0.7517509460449219, "rewards/rejected": -1.1266663074493408, "step": 611 }, { "epoch": 0.8986784140969163, "epsilon_dpo/beta": 0.0008717270102351904, "epsilon_dpo/beta_margin_grad_mean": -0.33847570419311523, "epsilon_dpo/beta_margin_grad_std": 0.15062828361988068, "epsilon_dpo/beta_margin_mean": 0.7811422348022461, "epsilon_dpo/beta_margin_std": 0.8147025108337402, "epsilon_dpo/loss_margin_mean": 899.7782592773438, "grad_norm": 41.552490234375, "kl/avg_steps": 0.6875, "kl/beta": 0.0008776738541200757, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -3.163311004638672, "logits/rejected": -4.886482238769531, "logps/chosen": -380.4996337890625, "logps/ref_chosen": -35.333831787109375, "logps/ref_rejected": -93.14432525634766, "logps/rejected": -1338.08837890625, "loss": 0.8826, "rewards/accuracies": 0.875, "rewards/chosen": -0.3019838333129883, "rewards/margins": 0.7811422348022461, "rewards/rejected": -1.0831260681152344, "step": 612 }, { "epoch": 0.9001468428781204, "epsilon_dpo/beta": 0.0008652299293316901, "epsilon_dpo/beta_margin_grad_mean": -0.3142598271369934, "epsilon_dpo/beta_margin_grad_std": 0.12845391035079956, "epsilon_dpo/beta_margin_mean": 0.8559463620185852, "epsilon_dpo/beta_margin_std": 0.6566179990768433, "epsilon_dpo/loss_margin_mean": 992.203369140625, "grad_norm": 61.14183044433594, "kl/avg_steps": 0.75, "kl/beta": 0.0008716810261830688, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.551886292185553e-08, "logits/chosen": -3.458996057510376, "logits/rejected": -4.9003472328186035, "logps/chosen": -418.6805114746094, "logps/ref_chosen": -36.464019775390625, "logps/ref_rejected": -113.0091781616211, "logps/rejected": -1487.4290771484375, "loss": 0.7942, "rewards/accuracies": 0.90625, "rewards/chosen": -0.33134663105010986, "rewards/margins": 0.8559463620185852, "rewards/rejected": -1.1872930526733398, "step": 613 }, { "epoch": 0.9016152716593245, "epsilon_dpo/beta": 0.0008593298261985183, "epsilon_dpo/beta_margin_grad_mean": -0.3246811330318451, "epsilon_dpo/beta_margin_grad_std": 0.15554648637771606, "epsilon_dpo/beta_margin_mean": 0.8337700366973877, "epsilon_dpo/beta_margin_std": 0.7923649549484253, "epsilon_dpo/loss_margin_mean": 974.2940063476562, "grad_norm": 48.429771423339844, "kl/avg_steps": 0.6875, "kl/beta": 0.0008651920943520963, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.507684480352292e-08, "logits/chosen": -3.3637969493865967, "logits/rejected": -4.854522705078125, "logps/chosen": -424.6036376953125, "logps/ref_chosen": -34.81976318359375, "logps/ref_rejected": -111.12577819824219, "logps/rejected": -1475.20361328125, "loss": 0.8482, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3362823724746704, "rewards/margins": 0.8337700366973877, "rewards/rejected": -1.1700525283813477, "step": 614 }, { "epoch": 0.9030837004405287, "epsilon_dpo/beta": 0.0008531937492080033, "epsilon_dpo/beta_margin_grad_mean": -0.3395027816295624, "epsilon_dpo/beta_margin_grad_std": 0.15511348843574524, "epsilon_dpo/beta_margin_mean": 0.7793731689453125, "epsilon_dpo/beta_margin_std": 0.8427423238754272, "epsilon_dpo/loss_margin_mean": 917.2001342773438, "grad_norm": 52.556758880615234, "kl/avg_steps": 0.71875, "kl/beta": 0.0008592845406383276, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -3.465533971786499, "logits/rejected": -4.829683303833008, "logps/chosen": -447.3005676269531, "logps/ref_chosen": -41.42036819458008, "logps/ref_rejected": -101.49702453613281, "logps/rejected": -1424.577392578125, "loss": 0.8908, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3474053144454956, "rewards/margins": 0.7793731689453125, "rewards/rejected": -1.126778483390808, "step": 615 }, { "epoch": 0.9045521292217328, "epsilon_dpo/beta": 0.000848971598315984, "epsilon_dpo/beta_margin_grad_mean": -0.3760168254375458, "epsilon_dpo/beta_margin_grad_std": 0.1653953492641449, "epsilon_dpo/beta_margin_mean": 0.5968384146690369, "epsilon_dpo/beta_margin_std": 0.8491699695587158, "epsilon_dpo/loss_margin_mean": 708.5220336914062, "grad_norm": 59.032936096191406, "kl/avg_steps": 0.5, "kl/beta": 0.0008531524799764156, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -3.490535259246826, "logits/rejected": -4.813992500305176, "logps/chosen": -511.3292236328125, "logps/ref_chosen": -45.615753173828125, "logps/ref_rejected": -80.37959289550781, "logps/rejected": -1254.6151123046875, "loss": 1.0265, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3976947069168091, "rewards/margins": 0.5968384146690369, "rewards/rejected": -0.9945331811904907, "step": 616 }, { "epoch": 0.9060205580029369, "epsilon_dpo/beta": 0.0008452784968540072, "epsilon_dpo/beta_margin_grad_mean": -0.3991197347640991, "epsilon_dpo/beta_margin_grad_std": 0.14801205694675446, "epsilon_dpo/beta_margin_mean": 0.4730072617530823, "epsilon_dpo/beta_margin_std": 0.7559821605682373, "epsilon_dpo/loss_margin_mean": 564.623291015625, "grad_norm": 69.23668670654297, "kl/avg_steps": 0.4375, "kl/beta": 0.0008489079191349447, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.378797888467345e-08, "logits/chosen": -3.5469822883605957, "logits/rejected": -4.843328475952148, "logps/chosen": -583.8492431640625, "logps/ref_chosen": -50.210060119628906, "logps/ref_rejected": -69.55174255371094, "logps/rejected": -1167.814208984375, "loss": 1.088, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4543150067329407, "rewards/margins": 0.4730072617530823, "rewards/rejected": -0.927322268486023, "step": 617 }, { "epoch": 0.9074889867841409, "epsilon_dpo/beta": 0.0008396140765398741, "epsilon_dpo/beta_margin_grad_mean": -0.35606545209884644, "epsilon_dpo/beta_margin_grad_std": 0.17117975652217865, "epsilon_dpo/beta_margin_mean": 0.6664056181907654, "epsilon_dpo/beta_margin_std": 0.8637328147888184, "epsilon_dpo/loss_margin_mean": 798.8843383789062, "grad_norm": 48.00564193725586, "kl/avg_steps": 0.671875, "kl/beta": 0.0008452101610600948, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -3.4216156005859375, "logits/rejected": -4.760526657104492, "logps/chosen": -533.0503540039062, "logps/ref_chosen": -43.185306549072266, "logps/ref_rejected": -98.49762725830078, "logps/rejected": -1387.2470703125, "loss": 0.9866, "rewards/accuracies": 0.875, "rewards/chosen": -0.41405919194221497, "rewards/margins": 0.6664056181907654, "rewards/rejected": -1.0804648399353027, "step": 618 }, { "epoch": 0.908957415565345, "epsilon_dpo/beta": 0.0008336182800121605, "epsilon_dpo/beta_margin_grad_mean": -0.34246736764907837, "epsilon_dpo/beta_margin_grad_std": 0.14548242092132568, "epsilon_dpo/beta_margin_mean": 0.7482366561889648, "epsilon_dpo/beta_margin_std": 0.7654976844787598, "epsilon_dpo/loss_margin_mean": 900.8438110351562, "grad_norm": 55.89384460449219, "kl/avg_steps": 0.71875, "kl/beta": 0.0008395693148486316, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -3.394374370574951, "logits/rejected": -4.941725254058838, "logps/chosen": -482.2840576171875, "logps/ref_chosen": -33.69963836669922, "logps/ref_rejected": -101.2354736328125, "logps/rejected": -1450.6636962890625, "loss": 0.891, "rewards/accuracies": 0.859375, "rewards/chosen": -0.37517303228378296, "rewards/margins": 0.7482366561889648, "rewards/rejected": -1.1234097480773926, "step": 619 }, { "epoch": 0.9104258443465492, "epsilon_dpo/beta": 0.0008292324491776526, "epsilon_dpo/beta_margin_grad_mean": -0.37484031915664673, "epsilon_dpo/beta_margin_grad_std": 0.15843062102794647, "epsilon_dpo/beta_margin_mean": 0.5776193737983704, "epsilon_dpo/beta_margin_std": 0.8028265833854675, "epsilon_dpo/loss_margin_mean": 701.974365234375, "grad_norm": 51.175804138183594, "kl/avg_steps": 0.53125, "kl/beta": 0.0008335779421031475, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -3.558187961578369, "logits/rejected": -4.871739387512207, "logps/chosen": -579.7201538085938, "logps/ref_chosen": -42.774513244628906, "logps/ref_rejected": -84.47439575195312, "logps/rejected": -1323.3944091796875, "loss": 1.0273, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4475584030151367, "rewards/margins": 0.5776193737983704, "rewards/rejected": -1.0251778364181519, "step": 620 }, { "epoch": 0.9118942731277533, "epsilon_dpo/beta": 0.0008243321790359914, "epsilon_dpo/beta_margin_grad_mean": -0.3368692100048065, "epsilon_dpo/beta_margin_grad_std": 0.16272993385791779, "epsilon_dpo/beta_margin_mean": 0.794694185256958, "epsilon_dpo/beta_margin_std": 0.8520438075065613, "epsilon_dpo/loss_margin_mean": 969.4013061523438, "grad_norm": 52.058837890625, "kl/avg_steps": 0.59375, "kl/beta": 0.000829172960948199, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -3.53486967086792, "logits/rejected": -5.024101257324219, "logps/chosen": -459.840576171875, "logps/ref_chosen": -37.82067108154297, "logps/ref_rejected": -94.49537658691406, "logps/rejected": -1485.9166259765625, "loss": 0.8891, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3498554825782776, "rewards/margins": 0.794694185256958, "rewards/rejected": -1.1445496082305908, "step": 621 }, { "epoch": 0.9133627019089574, "epsilon_dpo/beta": 0.0008184361504390836, "epsilon_dpo/beta_margin_grad_mean": -0.3743629455566406, "epsilon_dpo/beta_margin_grad_std": 0.13544589281082153, "epsilon_dpo/beta_margin_mean": 0.581274688243866, "epsilon_dpo/beta_margin_std": 0.6987016797065735, "epsilon_dpo/loss_margin_mean": 713.4944458007812, "grad_norm": 49.092411041259766, "kl/avg_steps": 0.71875, "kl/beta": 0.0008242788026109338, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -3.374379873275757, "logits/rejected": -4.7842888832092285, "logps/chosen": -553.1505126953125, "logps/ref_chosen": -39.961334228515625, "logps/ref_rejected": -92.28267669677734, "logps/rejected": -1318.96630859375, "loss": 0.9872, "rewards/accuracies": 0.875, "rewards/chosen": -0.4212498664855957, "rewards/margins": 0.5812746286392212, "rewards/rejected": -1.0025246143341064, "step": 622 }, { "epoch": 0.9148311306901615, "epsilon_dpo/beta": 0.000813874474260956, "epsilon_dpo/beta_margin_grad_mean": -0.3725954592227936, "epsilon_dpo/beta_margin_grad_std": 0.1656135767698288, "epsilon_dpo/beta_margin_mean": 0.5889464020729065, "epsilon_dpo/beta_margin_std": 0.8087703585624695, "epsilon_dpo/loss_margin_mean": 729.1351928710938, "grad_norm": 50.80109405517578, "kl/avg_steps": 0.5625, "kl/beta": 0.0008183965692296624, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -3.542423963546753, "logits/rejected": -4.91584587097168, "logps/chosen": -577.177001953125, "logps/ref_chosen": -49.0926513671875, "logps/ref_rejected": -91.09358215332031, "logps/rejected": -1348.3131103515625, "loss": 1.0243, "rewards/accuracies": 0.78125, "rewards/chosen": -0.432176798582077, "rewards/margins": 0.5889463424682617, "rewards/rejected": -1.0211231708526611, "step": 623 }, { "epoch": 0.9162995594713657, "epsilon_dpo/beta": 0.000808813376352191, "epsilon_dpo/beta_margin_grad_mean": -0.34141990542411804, "epsilon_dpo/beta_margin_grad_std": 0.16724152863025665, "epsilon_dpo/beta_margin_mean": 0.7709789872169495, "epsilon_dpo/beta_margin_std": 0.8832123875617981, "epsilon_dpo/loss_margin_mean": 958.7295532226562, "grad_norm": 48.523738861083984, "kl/avg_steps": 0.625, "kl/beta": 0.0008138188859447837, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -3.3430838584899902, "logits/rejected": -4.918627738952637, "logps/chosen": -515.30517578125, "logps/ref_chosen": -46.57392501831055, "logps/ref_rejected": -105.08536529541016, "logps/rejected": -1532.546142578125, "loss": 0.9153, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3814919590950012, "rewards/margins": 0.7709789872169495, "rewards/rejected": -1.1524710655212402, "step": 624 }, { "epoch": 0.9177679882525698, "epsilon_dpo/beta": 0.0008035369100980461, "epsilon_dpo/beta_margin_grad_mean": -0.34749388694763184, "epsilon_dpo/beta_margin_grad_std": 0.13651293516159058, "epsilon_dpo/beta_margin_mean": 0.6988623738288879, "epsilon_dpo/beta_margin_std": 0.6729094386100769, "epsilon_dpo/loss_margin_mean": 873.6240844726562, "grad_norm": 49.30683135986328, "kl/avg_steps": 0.65625, "kl/beta": 0.0008087640744633973, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -3.506286144256592, "logits/rejected": -4.933239936828613, "logps/chosen": -515.078857421875, "logps/ref_chosen": -43.60509490966797, "logps/ref_rejected": -92.33833312988281, "logps/rejected": -1437.4361572265625, "loss": 0.902, "rewards/accuracies": 0.84375, "rewards/chosen": -0.38019776344299316, "rewards/margins": 0.6988623142242432, "rewards/rejected": -1.0790600776672363, "step": 625 }, { "epoch": 0.9192364170337739, "epsilon_dpo/beta": 0.0007988003198988736, "epsilon_dpo/beta_margin_grad_mean": -0.3574766218662262, "epsilon_dpo/beta_margin_grad_std": 0.16741888225078583, "epsilon_dpo/beta_margin_mean": 0.6738518476486206, "epsilon_dpo/beta_margin_std": 0.8334734439849854, "epsilon_dpo/loss_margin_mean": 849.4379272460938, "grad_norm": 50.068603515625, "kl/avg_steps": 0.59375, "kl/beta": 0.0008034911588765681, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -3.526883363723755, "logits/rejected": -4.905704021453857, "logps/chosen": -593.0721435546875, "logps/ref_chosen": -50.95122528076172, "logps/ref_rejected": -103.29271697998047, "logps/rejected": -1494.8515625, "loss": 0.9691, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43541914224624634, "rewards/margins": 0.6738518476486206, "rewards/rejected": -1.1092710494995117, "step": 626 }, { "epoch": 0.920704845814978, "epsilon_dpo/beta": 0.00079333659959957, "epsilon_dpo/beta_margin_grad_mean": -0.32665178179740906, "epsilon_dpo/beta_margin_grad_std": 0.14166025817394257, "epsilon_dpo/beta_margin_mean": 0.8340927958488464, "epsilon_dpo/beta_margin_std": 0.7881202101707458, "epsilon_dpo/loss_margin_mean": 1055.364501953125, "grad_norm": 55.83228302001953, "kl/avg_steps": 0.6875, "kl/beta": 0.0007987486314959824, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 9.897955805412e-09, "logits/chosen": -3.3846006393432617, "logits/rejected": -4.88566780090332, "logps/chosen": -337.388427734375, "logps/ref_chosen": -28.80577850341797, "logps/ref_rejected": -114.96311950683594, "logps/rejected": -1478.910400390625, "loss": 0.8381, "rewards/accuracies": 0.875, "rewards/chosen": -0.24587570130825043, "rewards/margins": 0.8340927362442017, "rewards/rejected": -1.0799684524536133, "step": 627 }, { "epoch": 0.922173274596182, "epsilon_dpo/beta": 0.0007881674682721496, "epsilon_dpo/beta_margin_grad_mean": -0.34786510467529297, "epsilon_dpo/beta_margin_grad_std": 0.15347820520401, "epsilon_dpo/beta_margin_mean": 0.7319678068161011, "epsilon_dpo/beta_margin_std": 0.8163183331489563, "epsilon_dpo/loss_margin_mean": 933.1701049804688, "grad_norm": 49.38209915161133, "kl/avg_steps": 0.65625, "kl/beta": 0.0007932946900837123, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 9.543589206795238e-09, "logits/chosen": -3.572722911834717, "logits/rejected": -4.90597677230835, "logps/chosen": -581.771484375, "logps/ref_chosen": -45.28186798095703, "logps/ref_rejected": -108.524169921875, "logps/rejected": -1578.183837890625, "loss": 0.9161, "rewards/accuracies": 0.84375, "rewards/chosen": -0.42482954263687134, "rewards/margins": 0.7319678068161011, "rewards/rejected": -1.1567974090576172, "step": 628 }, { "epoch": 0.9236417033773862, "epsilon_dpo/beta": 0.00078401411883533, "epsilon_dpo/beta_margin_grad_mean": -0.34917938709259033, "epsilon_dpo/beta_margin_grad_std": 0.14244189858436584, "epsilon_dpo/beta_margin_mean": 0.7041851878166199, "epsilon_dpo/beta_margin_std": 0.7174273133277893, "epsilon_dpo/loss_margin_mean": 903.3053588867188, "grad_norm": 46.65474319458008, "kl/avg_steps": 0.53125, "kl/beta": 0.0007881226483732462, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.19555885822887e-09, "logits/chosen": -3.379412889480591, "logits/rejected": -4.922481536865234, "logps/chosen": -457.57720947265625, "logps/ref_chosen": -41.636070251464844, "logps/ref_rejected": -96.60995483398438, "logps/rejected": -1415.8564453125, "loss": 0.9092, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3272828459739685, "rewards/margins": 0.7041851282119751, "rewards/rejected": -1.0314680337905884, "step": 629 }, { "epoch": 0.9251101321585903, "epsilon_dpo/beta": 0.0007786460919305682, "epsilon_dpo/beta_margin_grad_mean": -0.3540729284286499, "epsilon_dpo/beta_margin_grad_std": 0.13038687407970428, "epsilon_dpo/beta_margin_mean": 0.6700939536094666, "epsilon_dpo/beta_margin_std": 0.6649956703186035, "epsilon_dpo/loss_margin_mean": 864.0321655273438, "grad_norm": 40.582828521728516, "kl/avg_steps": 0.6875, "kl/beta": 0.0007839578902348876, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 8.85387393063622e-09, "logits/chosen": -3.2850284576416016, "logits/rejected": -4.819733619689941, "logps/chosen": -374.94354248046875, "logps/ref_chosen": -31.366878509521484, "logps/ref_rejected": -90.5899658203125, "logps/rejected": -1298.19873046875, "loss": 0.9169, "rewards/accuracies": 0.859375, "rewards/chosen": -0.26877981424331665, "rewards/margins": 0.6700939536094666, "rewards/rejected": -0.9388737678527832, "step": 630 }, { "epoch": 0.9265785609397944, "epsilon_dpo/beta": 0.0007730860379524529, "epsilon_dpo/beta_margin_grad_mean": -0.3612503111362457, "epsilon_dpo/beta_margin_grad_std": 0.14340829849243164, "epsilon_dpo/beta_margin_mean": 0.6428188681602478, "epsilon_dpo/beta_margin_std": 0.7184677124023438, "epsilon_dpo/loss_margin_mean": 835.1441040039062, "grad_norm": 46.966888427734375, "kl/avg_steps": 0.71875, "kl/beta": 0.0007786049391143024, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.518543427732949e-09, "logits/chosen": -3.4160025119781494, "logits/rejected": -4.869636535644531, "logps/chosen": -483.23895263671875, "logps/ref_chosen": -44.379119873046875, "logps/ref_rejected": -86.64693450927734, "logps/rejected": -1360.65087890625, "loss": 0.9527, "rewards/accuracies": 0.875, "rewards/chosen": -0.34111732244491577, "rewards/margins": 0.642818808555603, "rewards/rejected": -0.9839361906051636, "step": 631 }, { "epoch": 0.9280469897209985, "epsilon_dpo/beta": 0.0007680523558519781, "epsilon_dpo/beta_margin_grad_mean": -0.3662717640399933, "epsilon_dpo/beta_margin_grad_std": 0.13854503631591797, "epsilon_dpo/beta_margin_mean": 0.6167137026786804, "epsilon_dpo/beta_margin_std": 0.6938005685806274, "epsilon_dpo/loss_margin_mean": 806.8697509765625, "grad_norm": 54.08295440673828, "kl/avg_steps": 0.65625, "kl/beta": 0.0007730486686341465, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.189576185789637e-09, "logits/chosen": -3.6578102111816406, "logits/rejected": -5.017969131469727, "logps/chosen": -500.601318359375, "logps/ref_chosen": -43.92643737792969, "logps/ref_rejected": -90.67631530761719, "logps/rejected": -1354.220947265625, "loss": 0.9639, "rewards/accuracies": 0.875, "rewards/chosen": -0.35203373432159424, "rewards/margins": 0.6167136430740356, "rewards/rejected": -0.9687473773956299, "step": 632 }, { "epoch": 0.9295154185022027, "epsilon_dpo/beta": 0.0007630449254065752, "epsilon_dpo/beta_margin_grad_mean": -0.36804521083831787, "epsilon_dpo/beta_margin_grad_std": 0.1329408884048462, "epsilon_dpo/beta_margin_mean": 0.5877854824066162, "epsilon_dpo/beta_margin_std": 0.6214975118637085, "epsilon_dpo/loss_margin_mean": 774.373046875, "grad_norm": 49.735836029052734, "kl/avg_steps": 0.65625, "kl/beta": 0.000768008641898632, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.866980873399015e-09, "logits/chosen": -3.5298781394958496, "logits/rejected": -4.785070419311523, "logps/chosen": -524.2335205078125, "logps/ref_chosen": -42.23455047607422, "logps/ref_rejected": -100.14579772949219, "logps/rejected": -1356.517822265625, "loss": 0.9689, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3694680333137512, "rewards/margins": 0.5877854824066162, "rewards/rejected": -0.9572535157203674, "step": 633 }, { "epoch": 0.9309838472834068, "epsilon_dpo/beta": 0.0007587854051962495, "epsilon_dpo/beta_margin_grad_mean": -0.3558174967765808, "epsilon_dpo/beta_margin_grad_std": 0.14750294387340546, "epsilon_dpo/beta_margin_mean": 0.6918417811393738, "epsilon_dpo/beta_margin_std": 0.782641589641571, "epsilon_dpo/loss_margin_mean": 916.8782958984375, "grad_norm": 43.45764923095703, "kl/avg_steps": 0.5625, "kl/beta": 0.0007630014442838728, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 7.550765991247654e-09, "logits/chosen": -3.4708662033081055, "logits/rejected": -4.9770402908325195, "logps/chosen": -502.7445068359375, "logps/ref_chosen": -39.36439895629883, "logps/ref_rejected": -113.15769958496094, "logps/rejected": -1493.4161376953125, "loss": 0.933, "rewards/accuracies": 0.796875, "rewards/chosen": -0.35321560502052307, "rewards/margins": 0.6918417811393738, "rewards/rejected": -1.0450574159622192, "step": 634 }, { "epoch": 0.9324522760646109, "epsilon_dpo/beta": 0.0007538297213613987, "epsilon_dpo/beta_margin_grad_mean": -0.39024749398231506, "epsilon_dpo/beta_margin_grad_std": 0.13376963138580322, "epsilon_dpo/beta_margin_mean": 0.5062357783317566, "epsilon_dpo/beta_margin_std": 0.6855813264846802, "epsilon_dpo/loss_margin_mean": 675.380859375, "grad_norm": 51.91278839111328, "kl/avg_steps": 0.65625, "kl/beta": 0.0007587335421703756, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.240939871891699e-09, "logits/chosen": -3.523716926574707, "logits/rejected": -4.783617973327637, "logps/chosen": -562.4381103515625, "logps/ref_chosen": -49.88642120361328, "logps/ref_rejected": -89.69390869140625, "logps/rejected": -1277.62646484375, "loss": 1.0411, "rewards/accuracies": 0.859375, "rewards/chosen": -0.38917070627212524, "rewards/margins": 0.5062357783317566, "rewards/rejected": -0.8954064846038818, "step": 635 }, { "epoch": 0.933920704845815, "epsilon_dpo/beta": 0.0007493861485272646, "epsilon_dpo/beta_margin_grad_mean": -0.3881148397922516, "epsilon_dpo/beta_margin_grad_std": 0.1402616947889328, "epsilon_dpo/beta_margin_mean": 0.5030276775360107, "epsilon_dpo/beta_margin_std": 0.6626613140106201, "epsilon_dpo/loss_margin_mean": 675.861083984375, "grad_norm": 52.333683013916016, "kl/avg_steps": 0.59375, "kl/beta": 0.0007537868223153055, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.937510679537628e-09, "logits/chosen": -3.715489387512207, "logits/rejected": -4.909304618835449, "logps/chosen": -566.8447875976562, "logps/ref_chosen": -46.58656692504883, "logps/ref_rejected": -86.21536254882812, "logps/rejected": -1282.334716796875, "loss": 1.043, "rewards/accuracies": 0.8125, "rewards/chosen": -0.39245936274528503, "rewards/margins": 0.5030276775360107, "rewards/rejected": -0.8954870104789734, "step": 636 }, { "epoch": 0.9353891336270191, "epsilon_dpo/beta": 0.0007449628901667893, "epsilon_dpo/beta_margin_grad_mean": -0.3551463186740875, "epsilon_dpo/beta_margin_grad_std": 0.14750678837299347, "epsilon_dpo/beta_margin_mean": 0.6876585483551025, "epsilon_dpo/beta_margin_std": 0.7672451138496399, "epsilon_dpo/loss_margin_mean": 928.1746215820312, "grad_norm": 41.60480880737305, "kl/avg_steps": 0.59375, "kl/beta": 0.0007493376033380628, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.640486409826785e-09, "logits/chosen": -3.4943830966949463, "logits/rejected": -4.821347236633301, "logps/chosen": -483.88134765625, "logps/ref_chosen": -37.54460144042969, "logps/ref_rejected": -103.94780731201172, "logps/rejected": -1478.459228515625, "loss": 0.9332, "rewards/accuracies": 0.84375, "rewards/chosen": -0.33350521326065063, "rewards/margins": 0.6876585483551025, "rewards/rejected": -1.0211637020111084, "step": 637 }, { "epoch": 0.9368575624082232, "epsilon_dpo/beta": 0.0007394017884507775, "epsilon_dpo/beta_margin_grad_mean": -0.3611353039741516, "epsilon_dpo/beta_margin_grad_std": 0.13404038548469543, "epsilon_dpo/beta_margin_mean": 0.6473937034606934, "epsilon_dpo/beta_margin_std": 0.7178838849067688, "epsilon_dpo/loss_margin_mean": 878.7179565429688, "grad_norm": 38.22664260864258, "kl/avg_steps": 0.75, "kl/beta": 0.0007449146942235529, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 6.349874889624962e-09, "logits/chosen": -3.2752623558044434, "logits/rejected": -5.041168212890625, "logps/chosen": -439.9283447265625, "logps/ref_chosen": -35.51661682128906, "logps/ref_rejected": -85.09121704101562, "logps/rejected": -1368.220947265625, "loss": 0.9433, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30027681589126587, "rewards/margins": 0.6473937034606934, "rewards/rejected": -0.9476705193519592, "step": 638 }, { "epoch": 0.9383259911894273, "epsilon_dpo/beta": 0.0007355150883086026, "epsilon_dpo/beta_margin_grad_mean": -0.400001585483551, "epsilon_dpo/beta_margin_grad_std": 0.13325192034244537, "epsilon_dpo/beta_margin_mean": 0.4458373486995697, "epsilon_dpo/beta_margin_std": 0.6368516087532043, "epsilon_dpo/loss_margin_mean": 610.8491821289062, "grad_norm": 46.616573333740234, "kl/avg_steps": 0.53125, "kl/beta": 0.0007393694249913096, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.065683776815933e-09, "logits/chosen": -3.443708896636963, "logits/rejected": -4.675836563110352, "logps/chosen": -551.689208984375, "logps/ref_chosen": -44.109619140625, "logps/ref_rejected": -81.57601928710938, "logps/rejected": -1200.0047607421875, "loss": 1.0788, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3750152587890625, "rewards/margins": 0.4458373486995697, "rewards/rejected": -0.8208526372909546, "step": 639 }, { "epoch": 0.9397944199706314, "epsilon_dpo/beta": 0.0007302492158487439, "epsilon_dpo/beta_margin_grad_mean": -0.34541165828704834, "epsilon_dpo/beta_margin_grad_std": 0.12778370082378387, "epsilon_dpo/beta_margin_mean": 0.7259718775749207, "epsilon_dpo/beta_margin_std": 0.7215431928634644, "epsilon_dpo/loss_margin_mean": 997.45458984375, "grad_norm": 47.58893966674805, "kl/avg_steps": 0.71875, "kl/beta": 0.0007354622939601541, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -3.3290483951568604, "logits/rejected": -4.9194793701171875, "logps/chosen": -445.7485656738281, "logps/ref_chosen": -40.14595413208008, "logps/ref_rejected": -114.68016815185547, "logps/rejected": -1517.7373046875, "loss": 0.8862, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2974758744239807, "rewards/margins": 0.7259718179702759, "rewards/rejected": -1.0234477519989014, "step": 640 }, { "epoch": 0.9412628487518355, "epsilon_dpo/beta": 0.0007254944066517055, "epsilon_dpo/beta_margin_grad_mean": -0.3516106605529785, "epsilon_dpo/beta_margin_grad_std": 0.14073027670383453, "epsilon_dpo/beta_margin_mean": 0.6929788589477539, "epsilon_dpo/beta_margin_std": 0.7391694188117981, "epsilon_dpo/loss_margin_mean": 959.70703125, "grad_norm": 43.9137077331543, "kl/avg_steps": 0.65625, "kl/beta": 0.0007302138837985694, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.516592558795746e-09, "logits/chosen": -3.348452568054199, "logits/rejected": -5.015802383422852, "logps/chosen": -465.90545654296875, "logps/ref_chosen": -39.17839050292969, "logps/ref_rejected": -94.15284729003906, "logps/rejected": -1480.5869140625, "loss": 0.9185, "rewards/accuracies": 0.875, "rewards/chosen": -0.3105979263782501, "rewards/margins": 0.6929788589477539, "rewards/rejected": -1.0035767555236816, "step": 641 }, { "epoch": 0.9427312775330396, "epsilon_dpo/beta": 0.0007200841791927814, "epsilon_dpo/beta_margin_grad_mean": -0.34033486247062683, "epsilon_dpo/beta_margin_grad_std": 0.1520089954137802, "epsilon_dpo/beta_margin_mean": 0.7516418695449829, "epsilon_dpo/beta_margin_std": 0.7943530082702637, "epsilon_dpo/loss_margin_mean": 1047.7049560546875, "grad_norm": 42.03846740722656, "kl/avg_steps": 0.75, "kl/beta": 0.0007254530792124569, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 5.251706922648868e-09, "logits/chosen": -3.4749832153320312, "logits/rejected": -5.020747184753418, "logps/chosen": -480.06494140625, "logps/ref_chosen": -46.66090393066406, "logps/ref_rejected": -115.78807067871094, "logps/rejected": -1596.8970947265625, "loss": 0.8997, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3134210705757141, "rewards/margins": 0.7516418695449829, "rewards/rejected": -1.0650629997253418, "step": 642 }, { "epoch": 0.9441997063142438, "epsilon_dpo/beta": 0.0007149488083086908, "epsilon_dpo/beta_margin_grad_mean": -0.3584926128387451, "epsilon_dpo/beta_margin_grad_std": 0.11874385923147202, "epsilon_dpo/beta_margin_mean": 0.6369244456291199, "epsilon_dpo/beta_margin_std": 0.594021201133728, "epsilon_dpo/loss_margin_mean": 893.6637573242188, "grad_norm": 39.312984466552734, "kl/avg_steps": 0.71875, "kl/beta": 0.0007200526888482273, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.993270631642038e-09, "logits/chosen": -3.4138784408569336, "logits/rejected": -4.991301536560059, "logps/chosen": -453.1264343261719, "logps/ref_chosen": -35.49954605102539, "logps/ref_rejected": -92.612060546875, "logps/rejected": -1403.9027099609375, "loss": 0.9222, "rewards/accuracies": 0.90625, "rewards/chosen": -0.29885566234588623, "rewards/margins": 0.6369244456291199, "rewards/rejected": -0.9357801079750061, "step": 643 }, { "epoch": 0.9456681350954479, "epsilon_dpo/beta": 0.0007100701914168894, "epsilon_dpo/beta_margin_grad_mean": -0.37228909134864807, "epsilon_dpo/beta_margin_grad_std": 0.1232423335313797, "epsilon_dpo/beta_margin_mean": 0.5710337162017822, "epsilon_dpo/beta_margin_std": 0.5858953595161438, "epsilon_dpo/loss_margin_mean": 807.3773193359375, "grad_norm": 46.13814926147461, "kl/avg_steps": 0.6875, "kl/beta": 0.0007149142329581082, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.741290495811873e-09, "logits/chosen": -3.398604154586792, "logits/rejected": -4.7896952629089355, "logps/chosen": -432.34515380859375, "logps/ref_chosen": -37.63022232055664, "logps/ref_rejected": -93.44629669189453, "logps/rejected": -1295.53857421875, "loss": 0.9704, "rewards/accuracies": 0.859375, "rewards/chosen": -0.28092193603515625, "rewards/margins": 0.5710337162017822, "rewards/rejected": -0.8519556522369385, "step": 644 }, { "epoch": 0.947136563876652, "epsilon_dpo/beta": 0.0007061094511300325, "epsilon_dpo/beta_margin_grad_mean": -0.38581857085227966, "epsilon_dpo/beta_margin_grad_std": 0.14517748355865479, "epsilon_dpo/beta_margin_mean": 0.5264767408370972, "epsilon_dpo/beta_margin_std": 0.6991389393806458, "epsilon_dpo/loss_margin_mean": 750.698974609375, "grad_norm": 42.622047424316406, "kl/avg_steps": 0.5625, "kl/beta": 0.0007100327638909221, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.495773155069299e-09, "logits/chosen": -3.5165293216705322, "logits/rejected": -4.699129104614258, "logps/chosen": -535.257568359375, "logps/ref_chosen": -37.85113525390625, "logps/ref_rejected": -105.40227508544922, "logps/rejected": -1353.5076904296875, "loss": 1.0346, "rewards/accuracies": 0.828125, "rewards/chosen": -0.35279667377471924, "rewards/margins": 0.5264767408370972, "rewards/rejected": -0.8792734146118164, "step": 645 }, { "epoch": 0.9486049926578561, "epsilon_dpo/beta": 0.0007019391632638872, "epsilon_dpo/beta_margin_grad_mean": -0.39241456985473633, "epsilon_dpo/beta_margin_grad_std": 0.129283145070076, "epsilon_dpo/beta_margin_mean": 0.4752976894378662, "epsilon_dpo/beta_margin_std": 0.5920587778091431, "epsilon_dpo/loss_margin_mean": 681.503662109375, "grad_norm": 47.977237701416016, "kl/avg_steps": 0.59375, "kl/beta": 0.0007060611969791353, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.256725079024553e-09, "logits/chosen": -3.479154109954834, "logits/rejected": -4.7768402099609375, "logps/chosen": -520.402099609375, "logps/ref_chosen": -41.30128860473633, "logps/ref_rejected": -82.82234954833984, "logps/rejected": -1243.4267578125, "loss": 1.0461, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33754080533981323, "rewards/margins": 0.4752976894378662, "rewards/rejected": -0.8128384947776794, "step": 646 }, { "epoch": 0.9500734214390602, "epsilon_dpo/beta": 0.0006964798085391521, "epsilon_dpo/beta_margin_grad_mean": -0.3713095486164093, "epsilon_dpo/beta_margin_grad_std": 0.11274945735931396, "epsilon_dpo/beta_margin_mean": 0.5578054785728455, "epsilon_dpo/beta_margin_std": 0.5175791382789612, "epsilon_dpo/loss_margin_mean": 803.4420776367188, "grad_norm": 48.550132751464844, "kl/avg_steps": 0.78125, "kl/beta": 0.0007018937030807137, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.024152566816791e-09, "logits/chosen": -3.453265905380249, "logits/rejected": -4.754009246826172, "logps/chosen": -431.2979736328125, "logps/ref_chosen": -35.967567443847656, "logps/ref_rejected": -98.74945068359375, "logps/rejected": -1297.52197265625, "loss": 0.9654, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2764018774032593, "rewards/margins": 0.5578055381774902, "rewards/rejected": -0.8342074155807495, "step": 647 }, { "epoch": 0.9515418502202643, "epsilon_dpo/beta": 0.0006910806987434626, "epsilon_dpo/beta_margin_grad_mean": -0.35618698596954346, "epsilon_dpo/beta_margin_grad_std": 0.14514584839344025, "epsilon_dpo/beta_margin_mean": 0.6508342027664185, "epsilon_dpo/beta_margin_std": 0.699810802936554, "epsilon_dpo/loss_margin_mean": 945.5726318359375, "grad_norm": 52.17890930175781, "kl/avg_steps": 0.78125, "kl/beta": 0.0006964526255615056, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.798061746947995e-09, "logits/chosen": -3.3199658393859863, "logits/rejected": -4.965579986572266, "logps/chosen": -432.31951904296875, "logps/ref_chosen": -33.676727294921875, "logps/ref_rejected": -105.11663818359375, "logps/rejected": -1449.33203125, "loss": 0.9453, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2765774726867676, "rewards/margins": 0.6508342027664185, "rewards/rejected": -0.927411675453186, "step": 648 }, { "epoch": 0.9530102790014684, "epsilon_dpo/beta": 0.0006873422535136342, "epsilon_dpo/beta_margin_grad_mean": -0.40578776597976685, "epsilon_dpo/beta_margin_grad_std": 0.1477607786655426, "epsilon_dpo/beta_margin_mean": 0.43915510177612305, "epsilon_dpo/beta_margin_std": 0.7278276085853577, "epsilon_dpo/loss_margin_mean": 644.4627685546875, "grad_norm": 40.87678146362305, "kl/avg_steps": 0.546875, "kl/beta": 0.0006910538068041205, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -3.504033088684082, "logits/rejected": -4.718470573425293, "logps/chosen": -526.4951171875, "logps/ref_chosen": -45.06011199951172, "logps/ref_rejected": -86.40021514892578, "logps/rejected": -1212.298095703125, "loss": 1.1091, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3326740860939026, "rewards/margins": 0.43915510177612305, "rewards/rejected": -0.7718292474746704, "step": 649 }, { "epoch": 0.9544787077826725, "epsilon_dpo/beta": 0.0006830678903497756, "epsilon_dpo/beta_margin_grad_mean": -0.3580322861671448, "epsilon_dpo/beta_margin_grad_std": 0.16048245131969452, "epsilon_dpo/beta_margin_mean": 0.6828190684318542, "epsilon_dpo/beta_margin_std": 0.8265655040740967, "epsilon_dpo/loss_margin_mean": 1005.4606323242188, "grad_norm": 45.862091064453125, "kl/avg_steps": 0.625, "kl/beta": 0.0006872951635159552, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -3.553635835647583, "logits/rejected": -5.113847732543945, "logps/chosen": -577.81494140625, "logps/ref_chosen": -39.21210861206055, "logps/ref_rejected": -103.7435302734375, "logps/rejected": -1647.8070068359375, "loss": 0.957, "rewards/accuracies": 0.84375, "rewards/chosen": -0.36932051181793213, "rewards/margins": 0.682819128036499, "rewards/rejected": -1.0521395206451416, "step": 650 }, { "epoch": 0.9559471365638766, "epsilon_dpo/beta": 0.0006790386396460235, "epsilon_dpo/beta_margin_grad_mean": -0.3646124601364136, "epsilon_dpo/beta_margin_grad_std": 0.14354108273983002, "epsilon_dpo/beta_margin_mean": 0.6235911250114441, "epsilon_dpo/beta_margin_std": 0.7001570463180542, "epsilon_dpo/loss_margin_mean": 923.6541137695312, "grad_norm": 40.79838943481445, "kl/avg_steps": 0.59375, "kl/beta": 0.0006830262136645615, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.158738163478475e-09, "logits/chosen": -3.4843456745147705, "logits/rejected": -4.997735500335693, "logps/chosen": -481.7735595703125, "logps/ref_chosen": -34.16796875, "logps/ref_rejected": -105.96416473388672, "logps/rejected": -1477.223876953125, "loss": 0.9624, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3056422173976898, "rewards/margins": 0.6235911250114441, "rewards/rejected": -0.9292333126068115, "step": 651 }, { "epoch": 0.9574155653450808, "epsilon_dpo/beta": 0.0006750306929461658, "epsilon_dpo/beta_margin_grad_mean": -0.36323583126068115, "epsilon_dpo/beta_margin_grad_std": 0.1515437662601471, "epsilon_dpo/beta_margin_mean": 0.6380264759063721, "epsilon_dpo/beta_margin_std": 0.7482752203941345, "epsilon_dpo/loss_margin_mean": 950.8151245117188, "grad_norm": 47.86579895019531, "kl/avg_steps": 0.59375, "kl/beta": 0.0006789946928620338, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -3.3652729988098145, "logits/rejected": -4.884889602661133, "logps/chosen": -517.8300170898438, "logps/ref_chosen": -43.1708984375, "logps/ref_rejected": -119.19691467285156, "logps/rejected": -1544.671142578125, "loss": 0.9666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32234591245651245, "rewards/margins": 0.6380264759063721, "rewards/rejected": -0.9603723883628845, "step": 652 }, { "epoch": 0.9588839941262849, "epsilon_dpo/beta": 0.0006693587056361139, "epsilon_dpo/beta_margin_grad_mean": -0.3340756595134735, "epsilon_dpo/beta_margin_grad_std": 0.13510021567344666, "epsilon_dpo/beta_margin_mean": 0.7706581950187683, "epsilon_dpo/beta_margin_std": 0.685066819190979, "epsilon_dpo/loss_margin_mean": 1153.705078125, "grad_norm": 48.55562973022461, "kl/avg_steps": 0.84375, "kl/beta": 0.0006749869789928198, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -3.2305068969726562, "logits/rejected": -5.08480167388916, "logps/chosen": -467.5439453125, "logps/ref_chosen": -36.02939987182617, "logps/ref_rejected": -111.6745376586914, "logps/rejected": -1696.89404296875, "loss": 0.8559, "rewards/accuracies": 0.90625, "rewards/chosen": -0.28941768407821655, "rewards/margins": 0.7706582546234131, "rewards/rejected": -1.0600758790969849, "step": 653 }, { "epoch": 0.960352422907489, "epsilon_dpo/beta": 0.0006645949906669557, "epsilon_dpo/beta_margin_grad_mean": -0.36657387018203735, "epsilon_dpo/beta_margin_grad_std": 0.1234474629163742, "epsilon_dpo/beta_margin_mean": 0.605144739151001, "epsilon_dpo/beta_margin_std": 0.6342909932136536, "epsilon_dpo/loss_margin_mean": 914.0985717773438, "grad_norm": 41.65330505371094, "kl/avg_steps": 0.71875, "kl/beta": 0.0006693393806926906, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.577954022936174e-09, "logits/chosen": -3.4308786392211914, "logits/rejected": -4.949398040771484, "logps/chosen": -473.72698974609375, "logps/ref_chosen": -38.32045364379883, "logps/ref_rejected": -105.32196807861328, "logps/rejected": -1454.8271484375, "loss": 0.9534, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2903170585632324, "rewards/margins": 0.605144739151001, "rewards/rejected": -0.8954617977142334, "step": 654 }, { "epoch": 0.9618208516886931, "epsilon_dpo/beta": 0.000660475343465805, "epsilon_dpo/beta_margin_grad_mean": -0.3668808341026306, "epsilon_dpo/beta_margin_grad_std": 0.14767590165138245, "epsilon_dpo/beta_margin_mean": 0.6252740025520325, "epsilon_dpo/beta_margin_std": 0.7545697093009949, "epsilon_dpo/loss_margin_mean": 952.0883178710938, "grad_norm": 43.04616165161133, "kl/avg_steps": 0.625, "kl/beta": 0.0006645628600381315, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.397392281198729e-09, "logits/chosen": -3.374847173690796, "logits/rejected": -4.775622844696045, "logps/chosen": -389.5122985839844, "logps/ref_chosen": -29.801528930664062, "logps/ref_rejected": -104.75204467773438, "logps/rejected": -1416.551025390625, "loss": 0.9735, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23886506259441376, "rewards/margins": 0.6252740025520325, "rewards/rejected": -0.8641390800476074, "step": 655 }, { "epoch": 0.9632892804698973, "epsilon_dpo/beta": 0.0006549282115884125, "epsilon_dpo/beta_margin_grad_mean": -0.32215529680252075, "epsilon_dpo/beta_margin_grad_std": 0.1285041719675064, "epsilon_dpo/beta_margin_mean": 0.8166189193725586, "epsilon_dpo/beta_margin_std": 0.6401075720787048, "epsilon_dpo/loss_margin_mean": 1249.337646484375, "grad_norm": 51.54441833496094, "kl/avg_steps": 0.84375, "kl/beta": 0.0006604351219721138, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.223355098446622e-09, "logits/chosen": -3.3955602645874023, "logits/rejected": -5.103156089782715, "logps/chosen": -448.05487060546875, "logps/ref_chosen": -40.732295989990234, "logps/ref_rejected": -120.36123657226562, "logps/rejected": -1777.021484375, "loss": 0.8158, "rewards/accuracies": 0.921875, "rewards/chosen": -0.2671954035758972, "rewards/margins": 0.8166189193725586, "rewards/rejected": -1.0838143825531006, "step": 656 }, { "epoch": 0.9647577092511013, "epsilon_dpo/beta": 0.0006502671749331057, "epsilon_dpo/beta_margin_grad_mean": -0.3548159599304199, "epsilon_dpo/beta_margin_grad_std": 0.12233418226242065, "epsilon_dpo/beta_margin_mean": 0.6513071060180664, "epsilon_dpo/beta_margin_std": 0.5884512662887573, "epsilon_dpo/loss_margin_mean": 1004.9851684570312, "grad_norm": 43.996578216552734, "kl/avg_steps": 0.71875, "kl/beta": 0.0006549093523062766, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.055847060721566e-09, "logits/chosen": -3.505584716796875, "logits/rejected": -5.049525260925293, "logps/chosen": -486.0547790527344, "logps/ref_chosen": -32.56511688232422, "logps/ref_rejected": -104.74242401123047, "logps/rejected": -1563.21728515625, "loss": 0.9133, "rewards/accuracies": 0.90625, "rewards/chosen": -0.29575812816619873, "rewards/margins": 0.6513071060180664, "rewards/rejected": -0.9470652341842651, "step": 657 }, { "epoch": 0.9662261380323054, "epsilon_dpo/beta": 0.0006452203379012644, "epsilon_dpo/beta_margin_grad_mean": -0.36330175399780273, "epsilon_dpo/beta_margin_grad_std": 0.11596217751502991, "epsilon_dpo/beta_margin_mean": 0.5977858304977417, "epsilon_dpo/beta_margin_std": 0.5393864512443542, "epsilon_dpo/loss_margin_mean": 929.5409545898438, "grad_norm": 35.85395431518555, "kl/avg_steps": 0.78125, "kl/beta": 0.0006502358010038733, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -3.3597793579101562, "logits/rejected": -5.021791458129883, "logps/chosen": -452.764404296875, "logps/ref_chosen": -32.16458511352539, "logps/ref_rejected": -100.98091125488281, "logps/rejected": -1451.1217041015625, "loss": 0.9409, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2720116972923279, "rewards/margins": 0.5977858304977417, "rewards/rejected": -0.8697974681854248, "step": 658 }, { "epoch": 0.9676945668135095, "epsilon_dpo/beta": 0.0006414285162463784, "epsilon_dpo/beta_margin_grad_mean": -0.3757038414478302, "epsilon_dpo/beta_margin_grad_std": 0.12834706902503967, "epsilon_dpo/beta_margin_mean": 0.5647572875022888, "epsilon_dpo/beta_margin_std": 0.6336069703102112, "epsilon_dpo/loss_margin_mean": 885.123046875, "grad_norm": 40.669891357421875, "kl/avg_steps": 0.59375, "kl/beta": 0.0006451951921917498, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -3.3426105976104736, "logits/rejected": -4.952641487121582, "logps/chosen": -490.76068115234375, "logps/ref_chosen": -44.455406188964844, "logps/ref_rejected": -93.29725646972656, "logps/rejected": -1424.7255859375, "loss": 0.9852, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2872786521911621, "rewards/margins": 0.5647573471069336, "rewards/rejected": -0.8520359992980957, "step": 659 }, { "epoch": 0.9691629955947136, "epsilon_dpo/beta": 0.0006372415809892118, "epsilon_dpo/beta_margin_grad_mean": -0.36822497844696045, "epsilon_dpo/beta_margin_grad_std": 0.1204964891076088, "epsilon_dpo/beta_margin_mean": 0.5889629125595093, "epsilon_dpo/beta_margin_std": 0.5796295404434204, "epsilon_dpo/loss_margin_mean": 928.2510375976562, "grad_norm": 37.189697265625, "kl/avg_steps": 0.65625, "kl/beta": 0.0006413869559764862, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.592541096695571e-09, "logits/chosen": -3.402561664581299, "logits/rejected": -5.116570472717285, "logps/chosen": -454.3048095703125, "logps/ref_chosen": -41.76560974121094, "logps/ref_rejected": -82.32925415039062, "logps/rejected": -1423.1195068359375, "loss": 0.9552, "rewards/accuracies": 0.859375, "rewards/chosen": -0.26376718282699585, "rewards/margins": 0.5889629125595093, "rewards/rejected": -0.8527300953865051, "step": 660 }, { "epoch": 0.9706314243759178, "epsilon_dpo/beta": 0.0006320912507362664, "epsilon_dpo/beta_margin_grad_mean": -0.3780907392501831, "epsilon_dpo/beta_margin_grad_std": 0.11522994190454483, "epsilon_dpo/beta_margin_mean": 0.5599713921546936, "epsilon_dpo/beta_margin_std": 0.6463156938552856, "epsilon_dpo/loss_margin_mean": 888.2638549804688, "grad_norm": 32.038639068603516, "kl/avg_steps": 0.8125, "kl/beta": 0.0006372053176164627, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -3.5372705459594727, "logits/rejected": -4.98753547668457, "logps/chosen": -504.87115478515625, "logps/ref_chosen": -38.271453857421875, "logps/ref_rejected": -92.10589599609375, "logps/rejected": -1446.969482421875, "loss": 0.9829, "rewards/accuracies": 0.90625, "rewards/chosen": -0.29568618535995483, "rewards/margins": 0.5599713921546936, "rewards/rejected": -0.8556575775146484, "step": 661 }, { "epoch": 0.9720998531571219, "epsilon_dpo/beta": 0.0006279845256358385, "epsilon_dpo/beta_margin_grad_mean": -0.3694445788860321, "epsilon_dpo/beta_margin_grad_std": 0.1416751593351364, "epsilon_dpo/beta_margin_mean": 0.5916183590888977, "epsilon_dpo/beta_margin_std": 0.681064784526825, "epsilon_dpo/loss_margin_mean": 946.8544311523438, "grad_norm": 37.34529113769531, "kl/avg_steps": 0.65625, "kl/beta": 0.0006320697139017284, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -3.4751718044281006, "logits/rejected": -5.172747611999512, "logps/chosen": -533.3956298828125, "logps/ref_chosen": -39.42928695678711, "logps/ref_rejected": -96.40357208251953, "logps/rejected": -1537.224365234375, "loss": 0.9818, "rewards/accuracies": 0.875, "rewards/chosen": -0.31171175837516785, "rewards/margins": 0.5916184186935425, "rewards/rejected": -0.9033301472663879, "step": 662 }, { "epoch": 0.973568281938326, "epsilon_dpo/beta": 0.0006246752454899251, "epsilon_dpo/beta_margin_grad_mean": -0.3684435486793518, "epsilon_dpo/beta_margin_grad_std": 0.14285063743591309, "epsilon_dpo/beta_margin_mean": 0.6072575449943542, "epsilon_dpo/beta_margin_std": 0.6948506832122803, "epsilon_dpo/loss_margin_mean": 978.2127075195312, "grad_norm": 46.52898025512695, "kl/avg_steps": 0.53125, "kl/beta": 0.0006279487861320376, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -3.4985265731811523, "logits/rejected": -4.843735694885254, "logps/chosen": -518.152099609375, "logps/ref_chosen": -44.08625411987305, "logps/ref_rejected": -121.31452178955078, "logps/rejected": -1573.593017578125, "loss": 0.9729, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2980585992336273, "rewards/margins": 0.6072574853897095, "rewards/rejected": -0.9053161144256592, "step": 663 }, { "epoch": 0.9750367107195301, "epsilon_dpo/beta": 0.0006194221205078065, "epsilon_dpo/beta_margin_grad_mean": -0.33726438879966736, "epsilon_dpo/beta_margin_grad_std": 0.1218588799238205, "epsilon_dpo/beta_margin_mean": 0.7527669072151184, "epsilon_dpo/beta_margin_std": 0.655570924282074, "epsilon_dpo/loss_margin_mean": 1217.3953857421875, "grad_norm": 43.377994537353516, "kl/avg_steps": 0.84375, "kl/beta": 0.0006246304837986827, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.066455926241383e-09, "logits/chosen": -3.1317358016967773, "logits/rejected": -5.002200126647949, "logps/chosen": -338.7730712890625, "logps/ref_chosen": -31.542118072509766, "logps/ref_rejected": -116.06498718261719, "logps/rejected": -1640.69140625, "loss": 0.8558, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19074614346027374, "rewards/margins": 0.7527669072151184, "rewards/rejected": -0.943513035774231, "step": 664 }, { "epoch": 0.9765051395007343, "epsilon_dpo/beta": 0.0006142394850030541, "epsilon_dpo/beta_margin_grad_mean": -0.37084051966667175, "epsilon_dpo/beta_margin_grad_std": 0.11781810969114304, "epsilon_dpo/beta_margin_mean": 0.5667802691459656, "epsilon_dpo/beta_margin_std": 0.5540540218353271, "epsilon_dpo/loss_margin_mean": 925.40185546875, "grad_norm": 44.8764762878418, "kl/avg_steps": 0.84375, "kl/beta": 0.0006194042507559061, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 9.513254770636137e-10, "logits/chosen": -3.3389735221862793, "logits/rejected": -5.07022762298584, "logps/chosen": -454.4164123535156, "logps/ref_chosen": -36.933109283447266, "logps/ref_rejected": -89.9020767211914, "logps/rejected": -1432.787353515625, "loss": 0.9663, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2572126090526581, "rewards/margins": 0.5667802095413208, "rewards/rejected": -0.8239928483963013, "step": 665 }, { "epoch": 0.9779735682819384, "epsilon_dpo/beta": 0.0006110197864472866, "epsilon_dpo/beta_margin_grad_mean": -0.39552050828933716, "epsilon_dpo/beta_margin_grad_std": 0.1250423640012741, "epsilon_dpo/beta_margin_mean": 0.46575719118118286, "epsilon_dpo/beta_margin_std": 0.5870808362960815, "epsilon_dpo/loss_margin_mean": 767.228515625, "grad_norm": 41.77029037475586, "kl/avg_steps": 0.53125, "kl/beta": 0.0006142217316664755, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.427576920763956e-10, "logits/chosen": -3.6731343269348145, "logits/rejected": -4.908390522003174, "logps/chosen": -623.1746826171875, "logps/ref_chosen": -47.59907913208008, "logps/ref_rejected": -102.33778381347656, "logps/rejected": -1445.141845703125, "loss": 1.0509, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3538281321525574, "rewards/margins": 0.46575719118118286, "rewards/rejected": -0.8195853233337402, "step": 666 }, { "epoch": 0.9794419970631424, "epsilon_dpo/beta": 0.000606836169026792, "epsilon_dpo/beta_margin_grad_mean": -0.38951799273490906, "epsilon_dpo/beta_margin_grad_std": 0.12825334072113037, "epsilon_dpo/beta_margin_mean": 0.4804462492465973, "epsilon_dpo/beta_margin_std": 0.5962188839912415, "epsilon_dpo/loss_margin_mean": 796.4560546875, "grad_norm": 38.44181823730469, "kl/avg_steps": 0.6875, "kl/beta": 0.0006109759560786188, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.407554321417764e-10, "logits/chosen": -3.463103771209717, "logits/rejected": -4.9513959884643555, "logps/chosen": -567.8262939453125, "logps/ref_chosen": -43.69598388671875, "logps/ref_rejected": -93.95926666259766, "logps/rejected": -1414.545654296875, "loss": 1.0431, "rewards/accuracies": 0.84375, "rewards/chosen": -0.320189893245697, "rewards/margins": 0.4804462790489197, "rewards/rejected": -0.8006361722946167, "step": 667 }, { "epoch": 0.9809104258443465, "epsilon_dpo/beta": 0.0006032615783624351, "epsilon_dpo/beta_margin_grad_mean": -0.3832368850708008, "epsilon_dpo/beta_margin_grad_std": 0.11170457303524017, "epsilon_dpo/beta_margin_mean": 0.5103982090950012, "epsilon_dpo/beta_margin_std": 0.517078161239624, "epsilon_dpo/loss_margin_mean": 850.3494262695312, "grad_norm": 37.77009963989258, "kl/avg_steps": 0.59375, "kl/beta": 0.0006068041548132896, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.453213851142225e-10, "logits/chosen": -3.5545034408569336, "logits/rejected": -4.857883453369141, "logps/chosen": -569.862548828125, "logps/ref_chosen": -48.83540344238281, "logps/ref_rejected": -108.24037170410156, "logps/rejected": -1479.616943359375, "loss": 0.9999, "rewards/accuracies": 0.84375, "rewards/chosen": -0.31520360708236694, "rewards/margins": 0.5103981494903564, "rewards/rejected": -0.8256018161773682, "step": 668 }, { "epoch": 0.9823788546255506, "epsilon_dpo/beta": 0.0005995123065076768, "epsilon_dpo/beta_margin_grad_mean": -0.3920701742172241, "epsilon_dpo/beta_margin_grad_std": 0.11787072569131851, "epsilon_dpo/beta_margin_mean": 0.4740796983242035, "epsilon_dpo/beta_margin_std": 0.5450995564460754, "epsilon_dpo/loss_margin_mean": 795.2932739257812, "grad_norm": 37.88663101196289, "kl/avg_steps": 0.625, "kl/beta": 0.0006032225210219622, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.564580657695939e-10, "logits/chosen": -3.4180922508239746, "logits/rejected": -4.825163841247559, "logps/chosen": -459.27081298828125, "logps/ref_chosen": -33.70474624633789, "logps/ref_rejected": -84.20474243164062, "logps/rejected": -1305.064208984375, "loss": 1.0343, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2562495172023773, "rewards/margins": 0.4740796983242035, "rewards/rejected": -0.7303292751312256, "step": 669 }, { "epoch": 0.9838472834067548, "epsilon_dpo/beta": 0.0005952265928499401, "epsilon_dpo/beta_margin_grad_mean": -0.3791544735431671, "epsilon_dpo/beta_margin_grad_std": 0.1323607712984085, "epsilon_dpo/beta_margin_mean": 0.5472928881645203, "epsilon_dpo/beta_margin_std": 0.6459078788757324, "epsilon_dpo/loss_margin_mean": 923.514404296875, "grad_norm": 38.17559051513672, "kl/avg_steps": 0.71875, "kl/beta": 0.0005994758103042841, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.741678157389739e-10, "logits/chosen": -3.493618965148926, "logits/rejected": -5.067645072937012, "logps/chosen": -505.29241943359375, "logps/ref_chosen": -41.06857681274414, "logps/ref_rejected": -103.42701721191406, "logps/rejected": -1491.165283203125, "loss": 1.0026, "rewards/accuracies": 0.890625, "rewards/chosen": -0.27749842405319214, "rewards/margins": 0.547292947769165, "rewards/rejected": -0.8247913122177124, "step": 670 }, { "epoch": 0.9853157121879589, "epsilon_dpo/beta": 0.000591909047216177, "epsilon_dpo/beta_margin_grad_mean": -0.38887372612953186, "epsilon_dpo/beta_margin_grad_std": 0.14615273475646973, "epsilon_dpo/beta_margin_mean": 0.5095086693763733, "epsilon_dpo/beta_margin_std": 0.7063105702400208, "epsilon_dpo/loss_margin_mean": 866.9005737304688, "grad_norm": 41.20299530029297, "kl/avg_steps": 0.5625, "kl/beta": 0.0005951978382654488, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -3.6080431938171387, "logits/rejected": -5.071390151977539, "logps/chosen": -567.7605590820312, "logps/ref_chosen": -35.17292785644531, "logps/ref_rejected": -90.90328216552734, "logps/rejected": -1490.3914794921875, "loss": 1.0497, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3173062801361084, "rewards/margins": 0.5095087289810181, "rewards/rejected": -0.8268150091171265, "step": 671 }, { "epoch": 0.986784140969163, "epsilon_dpo/beta": 0.000588598137255758, "epsilon_dpo/beta_margin_grad_mean": -0.3892599046230316, "epsilon_dpo/beta_margin_grad_std": 0.13194513320922852, "epsilon_dpo/beta_margin_mean": 0.491289347410202, "epsilon_dpo/beta_margin_std": 0.6089746356010437, "epsilon_dpo/loss_margin_mean": 840.4798583984375, "grad_norm": 42.78655242919922, "kl/avg_steps": 0.5625, "kl/beta": 0.0005918685346841812, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.293150240547549e-10, "logits/chosen": -3.515069007873535, "logits/rejected": -5.043622970581055, "logps/chosen": -580.21044921875, "logps/ref_chosen": -35.802879333496094, "logps/ref_rejected": -100.60040283203125, "logps/rejected": -1485.48779296875, "loss": 1.0379, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32212620973587036, "rewards/margins": 0.491289347410202, "rewards/rejected": -0.8134155869483948, "step": 672 }, { "epoch": 0.9882525697503671, "epsilon_dpo/beta": 0.0005843861144967377, "epsilon_dpo/beta_margin_grad_mean": -0.3730253279209137, "epsilon_dpo/beta_margin_grad_std": 0.10759243369102478, "epsilon_dpo/beta_margin_mean": 0.5548798441886902, "epsilon_dpo/beta_margin_std": 0.5018904209136963, "epsilon_dpo/loss_margin_mean": 952.566162109375, "grad_norm": 38.58977508544922, "kl/avg_steps": 0.71875, "kl/beta": 0.0005885579157620668, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -3.419285297393799, "logits/rejected": -5.03121280670166, "logps/chosen": -399.2283935546875, "logps/ref_chosen": -26.926271438598633, "logps/ref_rejected": -91.83390808105469, "logps/rejected": -1416.7021484375, "loss": 0.963, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2182503640651703, "rewards/margins": 0.5548798441886902, "rewards/rejected": -0.7731301784515381, "step": 673 }, { "epoch": 0.9897209985315712, "epsilon_dpo/beta": 0.0005809462745673954, "epsilon_dpo/beta_margin_grad_mean": -0.392082154750824, "epsilon_dpo/beta_margin_grad_std": 0.13613100349903107, "epsilon_dpo/beta_margin_mean": 0.47949162125587463, "epsilon_dpo/beta_margin_std": 0.6231409311294556, "epsilon_dpo/loss_margin_mean": 831.1757202148438, "grad_norm": 32.84586715698242, "kl/avg_steps": 0.59375, "kl/beta": 0.0005843578255735338, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -3.5797486305236816, "logits/rejected": -5.058620452880859, "logps/chosen": -462.48150634765625, "logps/ref_chosen": -32.161155700683594, "logps/ref_rejected": -73.33118438720703, "logps/rejected": -1334.8271484375, "loss": 1.0513, "rewards/accuracies": 0.796875, "rewards/chosen": -0.25154006481170654, "rewards/margins": 0.479491651058197, "rewards/rejected": -0.7310316562652588, "step": 674 }, { "epoch": 0.9911894273127754, "epsilon_dpo/beta": 0.0005764279630966485, "epsilon_dpo/beta_margin_grad_mean": -0.37939831614494324, "epsilon_dpo/beta_margin_grad_std": 0.11590909957885742, "epsilon_dpo/beta_margin_mean": 0.5321040749549866, "epsilon_dpo/beta_margin_std": 0.5455756187438965, "epsilon_dpo/loss_margin_mean": 925.9025268554688, "grad_norm": 30.256996154785156, "kl/avg_steps": 0.78125, "kl/beta": 0.0005809086724184453, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -3.4077110290527344, "logits/rejected": -5.058513641357422, "logps/chosen": -380.3410339355469, "logps/ref_chosen": -26.25046157836914, "logps/ref_rejected": -80.57308959960938, "logps/rejected": -1360.566162109375, "loss": 0.9896, "rewards/accuracies": 0.90625, "rewards/chosen": -0.20505669713020325, "rewards/margins": 0.5321040749549866, "rewards/rejected": -0.7371607422828674, "step": 675 }, { "epoch": 0.9926578560939795, "epsilon_dpo/beta": 0.0005732206045649946, "epsilon_dpo/beta_margin_grad_mean": -0.39650610089302063, "epsilon_dpo/beta_margin_grad_std": 0.13006651401519775, "epsilon_dpo/beta_margin_mean": 0.45457127690315247, "epsilon_dpo/beta_margin_std": 0.583877682685852, "epsilon_dpo/loss_margin_mean": 798.5032958984375, "grad_norm": 36.964908599853516, "kl/avg_steps": 0.5625, "kl/beta": 0.0005764055531471968, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -3.637737512588501, "logits/rejected": -4.942368507385254, "logps/chosen": -621.6793823242188, "logps/ref_chosen": -44.59165954589844, "logps/ref_rejected": -96.4842300415039, "logps/rejected": -1472.0751953125, "loss": 1.0612, "rewards/accuracies": 0.828125, "rewards/chosen": -0.332570344209671, "rewards/margins": 0.4545712471008301, "rewards/rejected": -0.7871416211128235, "step": 676 }, { "epoch": 0.9941262848751835, "epsilon_dpo/beta": 0.0005685811047442257, "epsilon_dpo/beta_margin_grad_mean": -0.3787325918674469, "epsilon_dpo/beta_margin_grad_std": 0.11398439854383469, "epsilon_dpo/beta_margin_mean": 0.5294075012207031, "epsilon_dpo/beta_margin_std": 0.5406895279884338, "epsilon_dpo/loss_margin_mean": 933.8375244140625, "grad_norm": 35.77093505859375, "kl/avg_steps": 0.8125, "kl/beta": 0.0005731813726015389, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 8.23423165278725e-11, "logits/chosen": -3.3925907611846924, "logits/rejected": -5.077644348144531, "logps/chosen": -438.6134948730469, "logps/ref_chosen": -29.166034698486328, "logps/ref_rejected": -84.22445678710938, "logps/rejected": -1427.509521484375, "loss": 0.9904, "rewards/accuracies": 0.921875, "rewards/chosen": -0.23343393206596375, "rewards/margins": 0.5294075012207031, "rewards/rejected": -0.7628414630889893, "step": 677 }, { "epoch": 0.9955947136563876, "epsilon_dpo/beta": 0.0005647094221785665, "epsilon_dpo/beta_margin_grad_mean": -0.3867654800415039, "epsilon_dpo/beta_margin_grad_std": 0.12935031950473785, "epsilon_dpo/beta_margin_mean": 0.4952041506767273, "epsilon_dpo/beta_margin_std": 0.5945535898208618, "epsilon_dpo/loss_margin_mean": 881.53369140625, "grad_norm": 34.48164749145508, "kl/avg_steps": 0.6875, "kl/beta": 0.0005685618380084634, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.270012410216185e-11, "logits/chosen": -3.5082788467407227, "logits/rejected": -5.051633358001709, "logps/chosen": -463.51409912109375, "logps/ref_chosen": -31.562625885009766, "logps/ref_rejected": -86.20599365234375, "logps/rejected": -1399.691162109375, "loss": 1.0318, "rewards/accuracies": 0.859375, "rewards/chosen": -0.24528416991233826, "rewards/margins": 0.4952041506767273, "rewards/rejected": -0.7404882907867432, "step": 678 }, { "epoch": 0.9970631424375918, "epsilon_dpo/beta": 0.0005620888550765812, "epsilon_dpo/beta_margin_grad_mean": -0.4052530825138092, "epsilon_dpo/beta_margin_grad_std": 0.11750102043151855, "epsilon_dpo/beta_margin_mean": 0.41717761754989624, "epsilon_dpo/beta_margin_std": 0.5460913777351379, "epsilon_dpo/loss_margin_mean": 747.7733154296875, "grad_norm": 37.97880935668945, "kl/avg_steps": 0.46875, "kl/beta": 0.0005646796198561788, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -3.43449068069458, "logits/rejected": -4.8752970695495605, "logps/chosen": -521.1033935546875, "logps/ref_chosen": -35.110084533691406, "logps/ref_rejected": -82.25491333007812, "logps/rejected": -1316.021484375, "loss": 1.0791, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2744186222553253, "rewards/margins": 0.41717761754989624, "rewards/rejected": -0.691596269607544, "step": 679 }, { "epoch": 0.9985315712187959, "epsilon_dpo/beta": 0.0005580611759796739, "epsilon_dpo/beta_margin_grad_mean": -0.360996276140213, "epsilon_dpo/beta_margin_grad_std": 0.11674246937036514, "epsilon_dpo/beta_margin_mean": 0.621047854423523, "epsilon_dpo/beta_margin_std": 0.5679141879081726, "epsilon_dpo/loss_margin_mean": 1116.3912353515625, "grad_norm": 45.923912048339844, "kl/avg_steps": 0.71875, "kl/beta": 0.0005620450829155743, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.31753782067201e-11, "logits/chosen": -3.399845600128174, "logits/rejected": -4.949437618255615, "logps/chosen": -484.4205017089844, "logps/ref_chosen": -49.117393493652344, "logps/ref_rejected": -118.43990325927734, "logps/rejected": -1670.13427734375, "loss": 0.9288, "rewards/accuracies": 0.890625, "rewards/chosen": -0.24402347207069397, "rewards/margins": 0.621047854423523, "rewards/rejected": -0.8650712966918945, "step": 680 }, { "epoch": 1.0, "epsilon_dpo/beta": 0.0005535554955713451, "epsilon_dpo/beta_margin_grad_mean": -0.37510794401168823, "epsilon_dpo/beta_margin_grad_std": 0.11336696147918701, "epsilon_dpo/beta_margin_mean": 0.5475642085075378, "epsilon_dpo/beta_margin_std": 0.535968005657196, "epsilon_dpo/loss_margin_mean": 992.2255859375, "grad_norm": 39.61741638183594, "kl/avg_steps": 0.8125, "kl/beta": 0.00055803416762501, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -3.4865760803222656, "logits/rejected": -5.149275302886963, "logps/chosen": -487.08197021484375, "logps/ref_chosen": -40.42382049560547, "logps/ref_rejected": -94.08821105957031, "logps/rejected": -1532.971923828125, "loss": 0.9757, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2478674352169037, "rewards/margins": 0.5475642085075378, "rewards/rejected": -0.7954316735267639, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 0.6833117403385398, "train_runtime": 3053.5655, "train_samples_per_second": 14.278, "train_steps_per_second": 0.223 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }