{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 100, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "epsilon_dpo/beta": 0.10007250308990479, "epsilon_dpo/beta_margin_grad_mean": -0.5023139119148254, "epsilon_dpo/beta_margin_grad_std": 0.008728506043553352, "epsilon_dpo/beta_margin_mean": -0.009260065853595734, "epsilon_dpo/beta_margin_std": 0.034927163273096085, "epsilon_dpo/loss_margin_mean": -0.08983081579208374, "grad_norm": 20.890338897705078, "kl/avg_steps": -0.0625, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 0.0, "logits/chosen": 0.16419550776481628, "logits/rejected": 0.3213843107223511, "logps/chosen": -85.75910186767578, "logps/ref_chosen": -85.73025512695312, "logps/ref_rejected": -88.7523193359375, "logps/rejected": -88.69134521484375, "loss": 1.3959, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0029897706117480993, "rewards/margins": -0.00926008727401495, "rewards/rejected": 0.006270316895097494, "step": 1 }, { "epoch": 0.0030234315948601664, "epsilon_dpo/beta": 0.10010381788015366, "epsilon_dpo/beta_margin_grad_mean": -0.5008825063705444, "epsilon_dpo/beta_margin_grad_std": 0.009928400628268719, "epsilon_dpo/beta_margin_mean": -0.0035273784305900335, "epsilon_dpo/beta_margin_std": 0.03973941132426262, "epsilon_dpo/loss_margin_mean": -0.0322224497795105, "grad_norm": 19.60763168334961, "kl/avg_steps": -0.03125, "kl/beta": 0.1000625416636467, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 7.462686567164179e-09, "logits/chosen": 0.22774043679237366, "logits/rejected": 0.2854035198688507, "logps/chosen": -79.52250671386719, "logps/ref_chosen": -79.5359115600586, "logps/ref_rejected": -88.0529556274414, "logps/rejected": -88.00732421875, "loss": 1.3902, "rewards/accuracies": 0.484375, "rewards/chosen": 0.00122718489728868, "rewards/margins": -0.0035273819230496883, "rewards/rejected": 0.0047545661218464375, "step": 2 }, { "epoch": 0.0045351473922902496, "epsilon_dpo/beta": 0.10007254779338837, "epsilon_dpo/beta_margin_grad_mean": -0.49972668290138245, "epsilon_dpo/beta_margin_grad_std": 0.010364462621510029, "epsilon_dpo/beta_margin_mean": 0.0010939195053651929, "epsilon_dpo/beta_margin_std": 0.04147500917315483, "epsilon_dpo/loss_margin_mean": 0.014205396175384521, "grad_norm": 22.582014083862305, "kl/avg_steps": 0.03125, "kl/beta": 0.10009381920099258, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.4925373134328357e-08, "logits/chosen": 0.414395809173584, "logits/rejected": 0.5279880166053772, "logps/chosen": -88.2738037109375, "logps/ref_chosen": -88.24923706054688, "logps/ref_rejected": -115.33953857421875, "logps/rejected": -115.37831115722656, "loss": 1.3856, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0025792494416236877, "rewards/margins": 0.0010940060019493103, "rewards/rejected": -0.0036732552107423544, "step": 3 }, { "epoch": 0.006046863189720333, "epsilon_dpo/beta": 0.09994746744632721, "epsilon_dpo/beta_margin_grad_mean": -0.4983575642108917, "epsilon_dpo/beta_margin_grad_std": 0.010976379737257957, "epsilon_dpo/beta_margin_mean": 0.006576488725841045, "epsilon_dpo/beta_margin_std": 0.04393243417143822, "epsilon_dpo/loss_margin_mean": 0.06914472579956055, "grad_norm": 22.595983505249023, "kl/avg_steps": 0.125, "kl/beta": 0.1000625491142273, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.2388059701492534e-08, "logits/chosen": 0.6811072826385498, "logits/rejected": 0.5620886087417603, "logps/chosen": -100.42185974121094, "logps/ref_chosen": -100.4271240234375, "logps/ref_rejected": -107.98687744140625, "logps/rejected": -108.0507583618164, "loss": 1.3802, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00037316768430173397, "rewards/margins": 0.006576500833034515, "rewards/rejected": -0.006203333381563425, "step": 4 }, { "epoch": 0.007558578987150416, "epsilon_dpo/beta": 0.09988515079021454, "epsilon_dpo/beta_margin_grad_mean": -0.4993220567703247, "epsilon_dpo/beta_margin_grad_std": 0.011706396006047726, "epsilon_dpo/beta_margin_mean": 0.0027205420192331076, "epsilon_dpo/beta_margin_std": 0.046862564980983734, "epsilon_dpo/loss_margin_mean": 0.030702859163284302, "grad_norm": 21.218976974487305, "kl/avg_steps": 0.0625, "kl/beta": 0.09993762522935867, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 2.9850746268656714e-08, "logits/chosen": 0.35812437534332275, "logits/rejected": 0.536173939704895, "logps/chosen": -86.76708984375, "logps/ref_chosen": -86.78726959228516, "logps/ref_rejected": -105.92151641845703, "logps/rejected": -105.93203735351562, "loss": 1.3841, "rewards/accuracies": 0.53125, "rewards/chosen": 0.001866575563326478, "rewards/margins": 0.0027204796206206083, "rewards/rejected": -0.0008539038244634867, "step": 5 }, { "epoch": 0.009070294784580499, "epsilon_dpo/beta": 0.09994761645793915, "epsilon_dpo/beta_margin_grad_mean": -0.500268280506134, "epsilon_dpo/beta_margin_grad_std": 0.009158617816865444, "epsilon_dpo/beta_margin_mean": -0.0010719078127294779, "epsilon_dpo/beta_margin_std": 0.036647580564022064, "epsilon_dpo/loss_margin_mean": -0.007842868566513062, "grad_norm": 21.13234519958496, "kl/avg_steps": -0.0625, "kl/beta": 0.09987520426511765, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 3.731343283582089e-08, "logits/chosen": 0.33463966846466064, "logits/rejected": 0.5480047464370728, "logps/chosen": -73.50778198242188, "logps/ref_chosen": -73.4627914428711, "logps/ref_rejected": -108.36318969726562, "logps/rejected": -108.40033721923828, "loss": 1.3877, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004621636122465134, "rewards/margins": -0.001071843784302473, "rewards/rejected": -0.003549792803823948, "step": 6 }, { "epoch": 0.010582010582010581, "epsilon_dpo/beta": 0.10004135221242905, "epsilon_dpo/beta_margin_grad_mean": -0.5008215308189392, "epsilon_dpo/beta_margin_grad_std": 0.00910845585167408, "epsilon_dpo/beta_margin_mean": -0.0032860664650797844, "epsilon_dpo/beta_margin_std": 0.03644711524248123, "epsilon_dpo/loss_margin_mean": -0.02999284863471985, "grad_norm": 20.586753845214844, "kl/avg_steps": -0.09375, "kl/beta": 0.09993766248226166, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 4.477611940298507e-08, "logits/chosen": 0.38425832986831665, "logits/rejected": 0.5074484944343567, "logps/chosen": -73.5091552734375, "logps/ref_chosen": -73.51263427734375, "logps/ref_rejected": -92.73175811767578, "logps/rejected": -92.69828796386719, "loss": 1.3899, "rewards/accuracies": 0.453125, "rewards/chosen": 0.00024446635507047176, "rewards/margins": -0.003286009654402733, "rewards/rejected": 0.0035304762423038483, "step": 7 }, { "epoch": 0.012093726379440665, "epsilon_dpo/beta": 0.10005691647529602, "epsilon_dpo/beta_margin_grad_mean": -0.5013768076896667, "epsilon_dpo/beta_margin_grad_std": 0.010522288270294666, "epsilon_dpo/beta_margin_mean": -0.005510755814611912, "epsilon_dpo/beta_margin_std": 0.04210984334349632, "epsilon_dpo/loss_margin_mean": -0.051828861236572266, "grad_norm": 21.454986572265625, "kl/avg_steps": -0.015625, "kl/beta": 0.10003144294023514, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.484375, "learning_rate": 5.223880597014925e-08, "logits/chosen": 0.44094252586364746, "logits/rejected": 0.6611981391906738, "logps/chosen": -86.21814727783203, "logps/ref_chosen": -86.1173095703125, "logps/ref_rejected": -97.60299682617188, "logps/rejected": -97.65200805664062, "loss": 1.3923, "rewards/accuracies": 0.484375, "rewards/chosen": -0.010229799896478653, "rewards/margins": -0.005510762333869934, "rewards/rejected": -0.004719037562608719, "step": 8 }, { "epoch": 0.013605442176870748, "epsilon_dpo/beta": 0.10011961311101913, "epsilon_dpo/beta_margin_grad_mean": -0.5023381114006042, "epsilon_dpo/beta_margin_grad_std": 0.010044464841485023, "epsilon_dpo/beta_margin_mean": -0.009359496645629406, "epsilon_dpo/beta_margin_std": 0.04019934684038162, "epsilon_dpo/loss_margin_mean": -0.0903191864490509, "grad_norm": 22.43557357788086, "kl/avg_steps": -0.0625, "kl/beta": 0.10004707425832748, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 5.970149253731343e-08, "logits/chosen": 0.33967816829681396, "logits/rejected": 0.5344347953796387, "logps/chosen": -76.60382843017578, "logps/ref_chosen": -76.54713439941406, "logps/ref_rejected": -113.62154388427734, "logps/rejected": -113.5879135131836, "loss": 1.3961, "rewards/accuracies": 0.453125, "rewards/chosen": -0.005803982727229595, "rewards/margins": -0.009359488263726234, "rewards/rejected": 0.0035555048380047083, "step": 9 }, { "epoch": 0.015117157974300832, "epsilon_dpo/beta": 0.09994741529226303, "epsilon_dpo/beta_margin_grad_mean": -0.498788446187973, "epsilon_dpo/beta_margin_grad_std": 0.013752726837992668, "epsilon_dpo/beta_margin_mean": 0.0048447358421981335, "epsilon_dpo/beta_margin_std": 0.05508105456829071, "epsilon_dpo/loss_margin_mean": 0.05251148343086243, "grad_norm": 21.88376235961914, "kl/avg_steps": 0.171875, "kl/beta": 0.10010964423418045, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.578125, "learning_rate": 6.71641791044776e-08, "logits/chosen": 0.3774455189704895, "logits/rejected": 0.5606910586357117, "logps/chosen": -86.106201171875, "logps/ref_chosen": -86.14695739746094, "logps/ref_rejected": -112.69779968261719, "logps/rejected": -112.70955657958984, "loss": 1.3822, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003937458153814077, "rewards/margins": 0.004844787064939737, "rewards/rejected": -0.0009073292021639645, "step": 10 }, { "epoch": 0.016628873771730914, "epsilon_dpo/beta": 0.09985417127609253, "epsilon_dpo/beta_margin_grad_mean": -0.5002174377441406, "epsilon_dpo/beta_margin_grad_std": 0.011608804576098919, "epsilon_dpo/beta_margin_mean": -0.0008744366350583732, "epsilon_dpo/beta_margin_std": 0.04646685719490051, "epsilon_dpo/loss_margin_mean": -0.00528264045715332, "grad_norm": 20.917577743530273, "kl/avg_steps": 0.09375, "kl/beta": 0.09993787854909897, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 7.462686567164178e-08, "logits/chosen": 0.46160414814949036, "logits/rejected": 0.4182695746421814, "logps/chosen": -93.89750671386719, "logps/ref_chosen": -93.89774322509766, "logps/ref_rejected": -91.40093994140625, "logps/rejected": -91.39541625976562, "loss": 1.3877, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0001967300195246935, "rewards/margins": -0.0008744364604353905, "rewards/rejected": 0.0006777062080800533, "step": 11 }, { "epoch": 0.018140589569160998, "epsilon_dpo/beta": 0.09980729967355728, "epsilon_dpo/beta_margin_grad_mean": -0.4989064633846283, "epsilon_dpo/beta_margin_grad_std": 0.011306311003863811, "epsilon_dpo/beta_margin_mean": 0.004372739233076572, "epsilon_dpo/beta_margin_std": 0.045261066406965256, "epsilon_dpo/loss_margin_mean": 0.047140806913375854, "grad_norm": 19.881134033203125, "kl/avg_steps": 0.046875, "kl/beta": 0.09984427690505981, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.515625, "learning_rate": 8.208955223880596e-08, "logits/chosen": 0.39477378129959106, "logits/rejected": 0.4941557049751282, "logps/chosen": -86.1840591430664, "logps/ref_chosen": -86.17680358886719, "logps/ref_rejected": -97.00653076171875, "logps/rejected": -97.0609359741211, "loss": 1.3824, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0008587964694015682, "rewards/margins": 0.004372742492705584, "rewards/rejected": -0.005231539253145456, "step": 12 }, { "epoch": 0.019652305366591082, "epsilon_dpo/beta": 0.09980747103691101, "epsilon_dpo/beta_margin_grad_mean": -0.5011844635009766, "epsilon_dpo/beta_margin_grad_std": 0.010257571935653687, "epsilon_dpo/beta_margin_mean": -0.004736810456961393, "epsilon_dpo/beta_margin_std": 0.041052356362342834, "epsilon_dpo/loss_margin_mean": -0.0443558394908905, "grad_norm": 20.99261474609375, "kl/avg_steps": 0.0, "kl/beta": 0.09979749470949173, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 8.955223880597014e-08, "logits/chosen": 0.04624774307012558, "logits/rejected": 0.36173152923583984, "logps/chosen": -67.12196350097656, "logps/ref_chosen": -67.10835266113281, "logps/ref_rejected": -114.23330688476562, "logps/rejected": -114.20256042480469, "loss": 1.3915, "rewards/accuracies": 0.453125, "rewards/chosen": -0.0015045893378555775, "rewards/margins": -0.00473686121404171, "rewards/rejected": 0.003232272807508707, "step": 13 }, { "epoch": 0.021164021164021163, "epsilon_dpo/beta": 0.09983865916728973, "epsilon_dpo/beta_margin_grad_mean": -0.5012878775596619, "epsilon_dpo/beta_margin_grad_std": 0.010014628060162067, "epsilon_dpo/beta_margin_mean": -0.005156196188181639, "epsilon_dpo/beta_margin_std": 0.040078554302453995, "epsilon_dpo/loss_margin_mean": -0.0487152636051178, "grad_norm": 22.20970344543457, "kl/avg_steps": -0.03125, "kl/beta": 0.09979749470949173, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 9.701492537313432e-08, "logits/chosen": 0.3906971514225006, "logits/rejected": 0.523829460144043, "logps/chosen": -87.2183837890625, "logps/ref_chosen": -87.24519348144531, "logps/ref_rejected": -96.43710327148438, "logps/rejected": -96.36158752441406, "loss": 1.3919, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0025335517711937428, "rewards/margins": -0.005156217608600855, "rewards/rejected": 0.007689769379794598, "step": 14 }, { "epoch": 0.022675736961451247, "epsilon_dpo/beta": 0.09971387684345245, "epsilon_dpo/beta_margin_grad_mean": -0.4991253614425659, "epsilon_dpo/beta_margin_grad_std": 0.012193976901471615, "epsilon_dpo/beta_margin_mean": 0.0035042737144976854, "epsilon_dpo/beta_margin_std": 0.0488242469727993, "epsilon_dpo/loss_margin_mean": 0.038533955812454224, "grad_norm": 22.38343048095703, "kl/avg_steps": 0.125, "kl/beta": 0.09982869029045105, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 1.044776119402985e-07, "logits/chosen": 0.41666990518569946, "logits/rejected": 0.5997655391693115, "logps/chosen": -86.97470092773438, "logps/ref_chosen": -87.02511596679688, "logps/ref_rejected": -115.12210845947266, "logps/rejected": -115.1102294921875, "loss": 1.3834, "rewards/accuracies": 0.546875, "rewards/chosen": 0.004882176406681538, "rewards/margins": 0.003504345426335931, "rewards/rejected": 0.0013778312131762505, "step": 15 }, { "epoch": 0.02418745275888133, "epsilon_dpo/beta": 0.09968286752700806, "epsilon_dpo/beta_margin_grad_mean": -0.4998158812522888, "epsilon_dpo/beta_margin_grad_std": 0.007838121615350246, "epsilon_dpo/beta_margin_mean": 0.0007361461757682264, "epsilon_dpo/beta_margin_std": 0.03135906532406807, "epsilon_dpo/loss_margin_mean": 0.009959220886230469, "grad_norm": 19.675779342651367, "kl/avg_steps": 0.03125, "kl/beta": 0.09970405697822571, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.1194029850746268e-07, "logits/chosen": 0.36139073967933655, "logits/rejected": 0.5083924531936646, "logps/chosen": -65.78361511230469, "logps/ref_chosen": -65.84059143066406, "logps/ref_rejected": -95.32190704345703, "logps/rejected": -95.27488708496094, "loss": 1.3858, "rewards/accuracies": 0.515625, "rewards/chosen": 0.005595714319497347, "rewards/margins": 0.0007362178876064718, "rewards/rejected": 0.004859496373683214, "step": 16 }, { "epoch": 0.025699168556311415, "epsilon_dpo/beta": 0.09983862936496735, "epsilon_dpo/beta_margin_grad_mean": -0.5021052956581116, "epsilon_dpo/beta_margin_grad_std": 0.0091917235404253, "epsilon_dpo/beta_margin_mean": -0.008424973115324974, "epsilon_dpo/beta_margin_std": 0.03678138181567192, "epsilon_dpo/loss_margin_mean": -0.08158034086227417, "grad_norm": 21.948225021362305, "kl/avg_steps": -0.15625, "kl/beta": 0.09967291355133057, "kl/n_epsilon_steps": 0.578125, "kl/p_epsilon_steps": 0.421875, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 0.265707790851593, "logits/rejected": 0.5080006122589111, "logps/chosen": -91.50019836425781, "logps/ref_chosen": -91.41425323486328, "logps/ref_rejected": -109.13850402832031, "logps/rejected": -109.14286804199219, "loss": 1.3951, "rewards/accuracies": 0.421875, "rewards/chosen": -0.008698860183358192, "rewards/margins": -0.008425043895840645, "rewards/rejected": -0.00027381590916775167, "step": 17 }, { "epoch": 0.027210884353741496, "epsilon_dpo/beta": 0.09996367245912552, "epsilon_dpo/beta_margin_grad_mean": -0.5014711022377014, "epsilon_dpo/beta_margin_grad_std": 0.010209612548351288, "epsilon_dpo/beta_margin_mean": -0.005889255087822676, "epsilon_dpo/beta_margin_std": 0.0408661924302578, "epsilon_dpo/loss_margin_mean": -0.055916786193847656, "grad_norm": 20.114952087402344, "kl/avg_steps": -0.125, "kl/beta": 0.09982889145612717, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 1.2686567164179106e-07, "logits/chosen": 0.295833557844162, "logits/rejected": 0.4196588397026062, "logps/chosen": -67.54663848876953, "logps/ref_chosen": -67.49713134765625, "logps/ref_rejected": -91.58694458007812, "logps/rejected": -91.58052825927734, "loss": 1.3926, "rewards/accuracies": 0.46875, "rewards/chosen": -0.005046804901212454, "rewards/margins": -0.005889310501515865, "rewards/rejected": 0.0008425057167187333, "step": 18 }, { "epoch": 0.02872260015117158, "epsilon_dpo/beta": 0.09993259608745575, "epsilon_dpo/beta_margin_grad_mean": -0.5004714727401733, "epsilon_dpo/beta_margin_grad_std": 0.01001923531293869, "epsilon_dpo/beta_margin_mean": -0.001890135812573135, "epsilon_dpo/beta_margin_std": 0.040112897753715515, "epsilon_dpo/loss_margin_mean": -0.01608338952064514, "grad_norm": 21.74436378479004, "kl/avg_steps": 0.03125, "kl/beta": 0.09995383769273758, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.343283582089552e-07, "logits/chosen": 0.534844696521759, "logits/rejected": 0.6198501586914062, "logps/chosen": -92.06356811523438, "logps/ref_chosen": -92.03971862792969, "logps/ref_rejected": -106.87828063964844, "logps/rejected": -106.88603210449219, "loss": 1.3886, "rewards/accuracies": 0.5, "rewards/chosen": -0.002494135871529579, "rewards/margins": -0.001890112180262804, "rewards/rejected": -0.0006040236912667751, "step": 19 }, { "epoch": 0.030234315948601664, "epsilon_dpo/beta": 0.09987014532089233, "epsilon_dpo/beta_margin_grad_mean": -0.49894291162490845, "epsilon_dpo/beta_margin_grad_std": 0.008131814189255238, "epsilon_dpo/beta_margin_mean": 0.004229863174259663, "epsilon_dpo/beta_margin_std": 0.032535601407289505, "epsilon_dpo/loss_margin_mean": 0.04488375782966614, "grad_norm": 21.32170295715332, "kl/avg_steps": 0.0625, "kl/beta": 0.09992261230945587, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 1.4179104477611938e-07, "logits/chosen": 0.298221617937088, "logits/rejected": 0.36488860845565796, "logps/chosen": -79.46307373046875, "logps/ref_chosen": -79.5280532836914, "logps/ref_rejected": -96.36955261230469, "logps/rejected": -96.34945678710938, "loss": 1.3823, "rewards/accuracies": 0.53125, "rewards/chosen": 0.006376877427101135, "rewards/margins": 0.00422988086938858, "rewards/rejected": 0.002146995859220624, "step": 20 }, { "epoch": 0.031746031746031744, "epsilon_dpo/beta": 0.09987018257379532, "epsilon_dpo/beta_margin_grad_mean": -0.5001654624938965, "epsilon_dpo/beta_margin_grad_std": 0.010333231650292873, "epsilon_dpo/beta_margin_mean": -0.0006611068965867162, "epsilon_dpo/beta_margin_std": 0.04135696962475777, "epsilon_dpo/loss_margin_mean": -0.003520876169204712, "grad_norm": 20.74885368347168, "kl/avg_steps": 0.0, "kl/beta": 0.09986019879579544, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.4925373134328355e-07, "logits/chosen": 0.32211834192276, "logits/rejected": 0.4015572667121887, "logps/chosen": -87.98433685302734, "logps/ref_chosen": -87.99819946289062, "logps/ref_rejected": -103.26502990722656, "logps/rejected": -103.24763488769531, "loss": 1.3874, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0012191254645586014, "rewards/margins": -0.0006612140568904579, "rewards/rejected": 0.0018803395796567202, "step": 21 }, { "epoch": 0.03325774754346183, "epsilon_dpo/beta": 0.0998389720916748, "epsilon_dpo/beta_margin_grad_mean": -0.49859529733657837, "epsilon_dpo/beta_margin_grad_std": 0.00828811526298523, "epsilon_dpo/beta_margin_mean": 0.005621983669698238, "epsilon_dpo/beta_margin_std": 0.033166419714689255, "epsilon_dpo/loss_margin_mean": 0.05865293741226196, "grad_norm": 20.97919464111328, "kl/avg_steps": 0.03125, "kl/beta": 0.09986019879579544, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.5671641791044775e-07, "logits/chosen": 0.23899997770786285, "logits/rejected": 0.5824065804481506, "logps/chosen": -81.6365966796875, "logps/ref_chosen": -81.64871978759766, "logps/ref_rejected": -100.75686645507812, "logps/rejected": -100.80340576171875, "loss": 1.381, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0011308013927191496, "rewards/margins": 0.0056219929829239845, "rewards/rejected": -0.004491191357374191, "step": 22 }, { "epoch": 0.03476946334089191, "epsilon_dpo/beta": 0.099838986992836, "epsilon_dpo/beta_margin_grad_mean": -0.4990421533584595, "epsilon_dpo/beta_margin_grad_std": 0.010845753364264965, "epsilon_dpo/beta_margin_mean": 0.003838536562398076, "epsilon_dpo/beta_margin_std": 0.043410323560237885, "epsilon_dpo/loss_margin_mean": 0.0417940616607666, "grad_norm": 21.895824432373047, "kl/avg_steps": 0.0, "kl/beta": 0.09982900321483612, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.6417910447761193e-07, "logits/chosen": 0.3985062539577484, "logits/rejected": 0.523756742477417, "logps/chosen": -92.14674377441406, "logps/ref_chosen": -92.16307830810547, "logps/ref_rejected": -113.73563385009766, "logps/rejected": -113.76109313964844, "loss": 1.3829, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0014242115430533886, "rewards/margins": 0.0038385987281799316, "rewards/rejected": -0.0024143874179571867, "step": 23 }, { "epoch": 0.036281179138321996, "epsilon_dpo/beta": 0.099838986992836, "epsilon_dpo/beta_margin_grad_mean": -0.49900925159454346, "epsilon_dpo/beta_margin_grad_std": 0.01059596799314022, "epsilon_dpo/beta_margin_mean": 0.0039714789018034935, "epsilon_dpo/beta_margin_std": 0.04242187365889549, "epsilon_dpo/loss_margin_mean": 0.04267817735671997, "grad_norm": 20.0445556640625, "kl/avg_steps": 0.0, "kl/beta": 0.09982900321483612, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.716417910447761e-07, "logits/chosen": 0.23830129206180573, "logits/rejected": 0.3382936120033264, "logps/chosen": -74.7188491821289, "logps/ref_chosen": -74.73088073730469, "logps/ref_rejected": -90.28663635253906, "logps/rejected": -90.31729125976562, "loss": 1.3828, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0010350291850045323, "rewards/margins": 0.003971490543335676, "rewards/rejected": -0.0029364614747464657, "step": 24 }, { "epoch": 0.03779289493575208, "epsilon_dpo/beta": 0.09974538534879684, "epsilon_dpo/beta_margin_grad_mean": -0.5002205967903137, "epsilon_dpo/beta_margin_grad_std": 0.012343145906925201, "epsilon_dpo/beta_margin_mean": -0.0008880018722265959, "epsilon_dpo/beta_margin_std": 0.04941296949982643, "epsilon_dpo/loss_margin_mean": -0.005264639854431152, "grad_norm": 21.26227569580078, "kl/avg_steps": 0.09375, "kl/beta": 0.09982900321483612, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.7910447761194027e-07, "logits/chosen": 0.38851284980773926, "logits/rejected": 0.5324080586433411, "logps/chosen": -81.98751068115234, "logps/ref_chosen": -81.99627685546875, "logps/ref_rejected": -114.30165100097656, "logps/rejected": -114.2876205444336, "loss": 1.3878, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0007206163718365133, "rewards/margins": -0.0008880080422386527, "rewards/rejected": 0.001608624355867505, "step": 25 }, { "epoch": 0.039304610733182165, "epsilon_dpo/beta": 0.09977664053440094, "epsilon_dpo/beta_margin_grad_mean": -0.500114381313324, "epsilon_dpo/beta_margin_grad_std": 0.009109625592827797, "epsilon_dpo/beta_margin_mean": -0.0004547676944639534, "epsilon_dpo/beta_margin_std": 0.03645933419466019, "epsilon_dpo/loss_margin_mean": -0.0018818974494934082, "grad_norm": 21.053829193115234, "kl/avg_steps": -0.03125, "kl/beta": 0.09973549842834473, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 1.8656716417910447e-07, "logits/chosen": 0.4273748993873596, "logits/rejected": 0.6181799173355103, "logps/chosen": -80.81678771972656, "logps/ref_chosen": -80.84437561035156, "logps/ref_rejected": -99.3437271118164, "logps/rejected": -99.31425476074219, "loss": 1.3871, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0025958300102502108, "rewards/margins": -0.0004547259886749089, "rewards/rejected": 0.0030505559407174587, "step": 26 }, { "epoch": 0.04081632653061224, "epsilon_dpo/beta": 0.09987019002437592, "epsilon_dpo/beta_margin_grad_mean": -0.5037402510643005, "epsilon_dpo/beta_margin_grad_std": 0.01256086491048336, "epsilon_dpo/beta_margin_mean": -0.014985705725848675, "epsilon_dpo/beta_margin_std": 0.050342340022325516, "epsilon_dpo/loss_margin_mean": -0.14665690064430237, "grad_norm": 21.79582977294922, "kl/avg_steps": -0.09375, "kl/beta": 0.09976667910814285, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 0.2670428454875946, "logits/rejected": 0.4372353255748749, "logps/chosen": -85.55349731445312, "logps/ref_chosen": -85.45787811279297, "logps/ref_rejected": -100.81729888916016, "logps/rejected": -100.76626586914062, "loss": 1.402, "rewards/accuracies": 0.453125, "rewards/chosen": -0.00972618255764246, "rewards/margins": -0.014985740184783936, "rewards/rejected": 0.005259557627141476, "step": 27 }, { "epoch": 0.042328042328042326, "epsilon_dpo/beta": 0.09996391087770462, "epsilon_dpo/beta_margin_grad_mean": -0.5001952648162842, "epsilon_dpo/beta_margin_grad_std": 0.011885374784469604, "epsilon_dpo/beta_margin_mean": -0.0007757341954857111, "epsilon_dpo/beta_margin_std": 0.04760770499706268, "epsilon_dpo/loss_margin_mean": -0.004470318555831909, "grad_norm": 20.6678524017334, "kl/avg_steps": -0.09375, "kl/beta": 0.0998602956533432, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 2.0149253731343282e-07, "logits/chosen": 0.343519926071167, "logits/rejected": 0.44987940788269043, "logps/chosen": -82.88597106933594, "logps/ref_chosen": -82.87960815429688, "logps/ref_rejected": -95.13070678710938, "logps/rejected": -95.13259887695312, "loss": 1.3876, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0007612911285832524, "rewards/margins": -0.0007756366394460201, "rewards/rejected": 1.434539444744587e-05, "step": 28 }, { "epoch": 0.04383975812547241, "epsilon_dpo/beta": 0.09996399283409119, "epsilon_dpo/beta_margin_grad_mean": -0.49949145317077637, "epsilon_dpo/beta_margin_grad_std": 0.010683816857635975, "epsilon_dpo/beta_margin_mean": 0.002037009224295616, "epsilon_dpo/beta_margin_std": 0.042756691575050354, "epsilon_dpo/loss_margin_mean": 0.023626625537872314, "grad_norm": 23.955188751220703, "kl/avg_steps": 0.0, "kl/beta": 0.09995400160551071, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 2.08955223880597e-07, "logits/chosen": 0.44967925548553467, "logits/rejected": 0.4150964021682739, "logps/chosen": -96.74322509765625, "logps/ref_chosen": -96.73393249511719, "logps/ref_rejected": -106.6417007446289, "logps/rejected": -106.67463684082031, "loss": 1.3847, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0010466062230989337, "rewards/margins": 0.0020370446145534515, "rewards/rejected": -0.003083650954067707, "step": 29 }, { "epoch": 0.045351473922902494, "epsilon_dpo/beta": 0.0999482199549675, "epsilon_dpo/beta_margin_grad_mean": -0.5016565918922424, "epsilon_dpo/beta_margin_grad_std": 0.01134086586534977, "epsilon_dpo/beta_margin_mean": -0.006640481762588024, "epsilon_dpo/beta_margin_std": 0.045420292764902115, "epsilon_dpo/loss_margin_mean": -0.06311115622520447, "grad_norm": 22.373016357421875, "kl/avg_steps": 0.015625, "kl/beta": 0.09995400160551071, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.5, "learning_rate": 2.1641791044776117e-07, "logits/chosen": 0.2677932381629944, "logits/rejected": 0.44384220242500305, "logps/chosen": -73.20950317382812, "logps/ref_chosen": -73.2177734375, "logps/ref_rejected": -111.69508361816406, "logps/rejected": -111.62370300292969, "loss": 1.3935, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0007222597487270832, "rewards/margins": -0.006640553940087557, "rewards/rejected": 0.00736281368881464, "step": 30 }, { "epoch": 0.04686318972033258, "epsilon_dpo/beta": 0.09985467791557312, "epsilon_dpo/beta_margin_grad_mean": -0.4987980127334595, "epsilon_dpo/beta_margin_grad_std": 0.01364812720566988, "epsilon_dpo/beta_margin_mean": 0.0048094564117491245, "epsilon_dpo/beta_margin_std": 0.05463935062289238, "epsilon_dpo/loss_margin_mean": 0.052467405796051025, "grad_norm": 24.33083724975586, "kl/avg_steps": 0.09375, "kl/beta": 0.09993838518857956, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 0.2801819145679474, "logits/rejected": 0.494117796421051, "logps/chosen": -91.95301818847656, "logps/ref_chosen": -91.93222045898438, "logps/ref_rejected": -126.33772277832031, "logps/rejected": -126.41098022460938, "loss": 1.3822, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0022574204485863447, "rewards/margins": 0.004809522069990635, "rewards/rejected": -0.007066942285746336, "step": 31 }, { "epoch": 0.04837490551776266, "epsilon_dpo/beta": 0.09991717338562012, "epsilon_dpo/beta_margin_grad_mean": -0.5006169080734253, "epsilon_dpo/beta_margin_grad_std": 0.009541872888803482, "epsilon_dpo/beta_margin_mean": -0.0024695084430277348, "epsilon_dpo/beta_margin_std": 0.038185965269804, "epsilon_dpo/loss_margin_mean": -0.021846860647201538, "grad_norm": 21.380720138549805, "kl/avg_steps": -0.0625, "kl/beta": 0.0998447835445404, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 2.3134328358208954e-07, "logits/chosen": 0.04479830712080002, "logits/rejected": 0.5272079110145569, "logps/chosen": -94.07949829101562, "logps/ref_chosen": -94.01815795898438, "logps/ref_rejected": -106.67022705078125, "logps/rejected": -106.709716796875, "loss": 1.3891, "rewards/accuracies": 0.421875, "rewards/chosen": -0.006249488331377506, "rewards/margins": -0.0024695044849067926, "rewards/rejected": -0.0037799840793013573, "step": 32 }, { "epoch": 0.049886621315192746, "epsilon_dpo/beta": 0.09966742247343063, "epsilon_dpo/beta_margin_grad_mean": -0.4978310465812683, "epsilon_dpo/beta_margin_grad_std": 0.011500690132379532, "epsilon_dpo/beta_margin_mean": 0.008678106591105461, "epsilon_dpo/beta_margin_std": 0.04604245722293854, "epsilon_dpo/loss_margin_mean": 0.09031841158866882, "grad_norm": 22.49273681640625, "kl/avg_steps": 0.25, "kl/beta": 0.09990722686052322, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.388059701492537e-07, "logits/chosen": 0.21207210421562195, "logits/rejected": 0.4895442724227905, "logps/chosen": -89.40026092529297, "logps/ref_chosen": -89.4225082397461, "logps/ref_rejected": -105.73570251464844, "logps/rejected": -105.80377197265625, "loss": 1.3782, "rewards/accuracies": 0.65625, "rewards/chosen": 0.002066537970677018, "rewards/margins": 0.008678080514073372, "rewards/rejected": -0.006611541844904423, "step": 33 }, { "epoch": 0.05139833711262283, "epsilon_dpo/beta": 0.0996057540178299, "epsilon_dpo/beta_margin_grad_mean": -0.4997608959674835, "epsilon_dpo/beta_margin_grad_std": 0.009196538478136063, "epsilon_dpo/beta_margin_mean": 0.0009559081518091261, "epsilon_dpo/beta_margin_std": 0.03679739311337471, "epsilon_dpo/loss_margin_mean": 0.012517750263214111, "grad_norm": 20.236793518066406, "kl/avg_steps": 0.0625, "kl/beta": 0.09965807944536209, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 2.4626865671641786e-07, "logits/chosen": 0.2664468288421631, "logits/rejected": 0.48599302768707275, "logps/chosen": -75.28192138671875, "logps/ref_chosen": -75.25447845458984, "logps/ref_rejected": -90.25471496582031, "logps/rejected": -90.294677734375, "loss": 1.3857, "rewards/accuracies": 0.515625, "rewards/chosen": -0.002886722795665264, "rewards/margins": 0.0009559143800288439, "rewards/rejected": -0.0038426369428634644, "step": 34 }, { "epoch": 0.05291005291005291, "epsilon_dpo/beta": 0.09963691234588623, "epsilon_dpo/beta_margin_grad_mean": -0.500873327255249, "epsilon_dpo/beta_margin_grad_std": 0.009642170742154121, "epsilon_dpo/beta_margin_mean": -0.003494812408462167, "epsilon_dpo/beta_margin_std": 0.03858347609639168, "epsilon_dpo/loss_margin_mean": -0.03209325671195984, "grad_norm": 20.750717163085938, "kl/avg_steps": -0.03125, "kl/beta": 0.09959582984447479, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 2.537313432835821e-07, "logits/chosen": 0.2340129315853119, "logits/rejected": 0.3774367570877075, "logps/chosen": -84.54783630371094, "logps/ref_chosen": -84.49177551269531, "logps/ref_rejected": -96.995849609375, "logps/rejected": -97.01981353759766, "loss": 1.3902, "rewards/accuracies": 0.5, "rewards/chosen": -0.005702049471437931, "rewards/margins": -0.003494792152196169, "rewards/rejected": -0.002207257319241762, "step": 35 }, { "epoch": 0.05442176870748299, "epsilon_dpo/beta": 0.09960579127073288, "epsilon_dpo/beta_margin_grad_mean": -0.4998779594898224, "epsilon_dpo/beta_margin_grad_std": 0.012096166610717773, "epsilon_dpo/beta_margin_mean": 0.000493088096845895, "epsilon_dpo/beta_margin_std": 0.04841361939907074, "epsilon_dpo/loss_margin_mean": 0.008720353245735168, "grad_norm": 21.406538009643555, "kl/avg_steps": 0.03125, "kl/beta": 0.09962696582078934, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 2.611940298507462e-07, "logits/chosen": 0.23152907192707062, "logits/rejected": 0.601810097694397, "logps/chosen": -72.32382202148438, "logps/ref_chosen": -72.29872131347656, "logps/ref_rejected": -106.939453125, "logps/rejected": -106.9732894897461, "loss": 1.3864, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0026765610091388226, "rewards/margins": 0.000493088096845895, "rewards/rejected": -0.0031696492806077003, "step": 36 }, { "epoch": 0.055933484504913075, "epsilon_dpo/beta": 0.09951242059469223, "epsilon_dpo/beta_margin_grad_mean": -0.49880462884902954, "epsilon_dpo/beta_margin_grad_std": 0.011057516559958458, "epsilon_dpo/beta_margin_mean": 0.004784699063748121, "epsilon_dpo/beta_margin_std": 0.0442543663084507, "epsilon_dpo/loss_margin_mean": 0.051472991704940796, "grad_norm": 26.469751358032227, "kl/avg_steps": 0.09375, "kl/beta": 0.09959584474563599, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.686567164179104e-07, "logits/chosen": 0.2194148302078247, "logits/rejected": 0.3654429614543915, "logps/chosen": -81.1114501953125, "logps/ref_chosen": -81.10102844238281, "logps/ref_rejected": -144.65945434570312, "logps/rejected": -144.72134399414062, "loss": 1.382, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0012095842976123095, "rewards/margins": 0.004784717224538326, "rewards/rejected": -0.005994301289319992, "step": 37 }, { "epoch": 0.05744520030234316, "epsilon_dpo/beta": 0.09941920638084412, "epsilon_dpo/beta_margin_grad_mean": -0.5004234313964844, "epsilon_dpo/beta_margin_grad_std": 0.010645652189850807, "epsilon_dpo/beta_margin_mean": -0.0017045249696820974, "epsilon_dpo/beta_margin_std": 0.04264267161488533, "epsilon_dpo/loss_margin_mean": -0.014012634754180908, "grad_norm": 21.179275512695312, "kl/avg_steps": 0.09375, "kl/beta": 0.0995025560259819, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.761194029850746e-07, "logits/chosen": 0.7173947095870972, "logits/rejected": 0.7745821475982666, "logps/chosen": -90.1217041015625, "logps/ref_chosen": -90.05032348632812, "logps/ref_rejected": -96.94110107421875, "logps/rejected": -96.99847412109375, "loss": 1.3885, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007230603136122227, "rewards/margins": -0.0017045974964275956, "rewards/rejected": -0.005526005756109953, "step": 38 }, { "epoch": 0.05895691609977324, "epsilon_dpo/beta": 0.09935716539621353, "epsilon_dpo/beta_margin_grad_mean": -0.4987095892429352, "epsilon_dpo/beta_margin_grad_std": 0.009196286089718342, "epsilon_dpo/beta_margin_mean": 0.005164078436791897, "epsilon_dpo/beta_margin_std": 0.03680035471916199, "epsilon_dpo/loss_margin_mean": 0.0548284649848938, "grad_norm": 21.44578742980957, "kl/avg_steps": 0.0625, "kl/beta": 0.09940936416387558, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 2.8358208955223876e-07, "logits/chosen": 0.5785679221153259, "logits/rejected": 0.6269518136978149, "logps/chosen": -81.97523498535156, "logps/ref_chosen": -81.93487548828125, "logps/ref_rejected": -103.41276550292969, "logps/rejected": -103.50794982910156, "loss": 1.3815, "rewards/accuracies": 0.515625, "rewards/chosen": -0.004156146664172411, "rewards/margins": 0.005164049565792084, "rewards/rejected": -0.009320196695625782, "step": 39 }, { "epoch": 0.06046863189720333, "epsilon_dpo/beta": 0.09920196235179901, "epsilon_dpo/beta_margin_grad_mean": -0.49731066823005676, "epsilon_dpo/beta_margin_grad_std": 0.011403567157685757, "epsilon_dpo/beta_margin_mean": 0.010767289437353611, "epsilon_dpo/beta_margin_std": 0.04566150903701782, "epsilon_dpo/loss_margin_mean": 0.11194059252738953, "grad_norm": 20.931968688964844, "kl/avg_steps": 0.15625, "kl/beta": 0.09934727102518082, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 2.9104477611940296e-07, "logits/chosen": 0.4167971611022949, "logits/rejected": 0.3265213370323181, "logps/chosen": -83.24398803710938, "logps/ref_chosen": -83.23312377929688, "logps/ref_rejected": -104.69706726074219, "logps/rejected": -104.81986999511719, "loss": 1.3761, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0012151999399065971, "rewards/margins": 0.010767241939902306, "rewards/rejected": -0.011982442811131477, "step": 40 }, { "epoch": 0.06198034769463341, "epsilon_dpo/beta": 0.09929520636796951, "epsilon_dpo/beta_margin_grad_mean": -0.5001014471054077, "epsilon_dpo/beta_margin_grad_std": 0.00980335846543312, "epsilon_dpo/beta_margin_mean": -0.00040489688399247825, "epsilon_dpo/beta_margin_std": 0.03922676667571068, "epsilon_dpo/loss_margin_mean": -0.000916600227355957, "grad_norm": 21.39777183532715, "kl/avg_steps": -0.09375, "kl/beta": 0.09919228404760361, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 2.985074626865671e-07, "logits/chosen": 0.39663296937942505, "logits/rejected": 0.3223673105239868, "logps/chosen": -92.5504150390625, "logps/ref_chosen": -92.5335693359375, "logps/ref_rejected": -107.55760192871094, "logps/rejected": -107.57351684570312, "loss": 1.3871, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0017897419165819883, "rewards/margins": -0.0004048999398946762, "rewards/rejected": -0.0013848419766873121, "step": 41 }, { "epoch": 0.06349206349206349, "epsilon_dpo/beta": 0.09917117655277252, "epsilon_dpo/beta_margin_grad_mean": -0.5006127953529358, "epsilon_dpo/beta_margin_grad_std": 0.011890949681401253, "epsilon_dpo/beta_margin_mean": -0.0024510445073246956, "epsilon_dpo/beta_margin_std": 0.047598425298929214, "epsilon_dpo/loss_margin_mean": -0.021076232194900513, "grad_norm": 21.89639663696289, "kl/avg_steps": 0.125, "kl/beta": 0.09928536415100098, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.059701492537313e-07, "logits/chosen": 0.35543912649154663, "logits/rejected": 0.3687231242656708, "logps/chosen": -84.61444091796875, "logps/ref_chosen": -84.56074523925781, "logps/ref_rejected": -98.42295837402344, "logps/rejected": -98.45557403564453, "loss": 1.3893, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005525700282305479, "rewards/margins": -0.002451097359880805, "rewards/rejected": -0.0030746031552553177, "step": 42 }, { "epoch": 0.06500377928949358, "epsilon_dpo/beta": 0.09910934418439865, "epsilon_dpo/beta_margin_grad_mean": -0.4985501766204834, "epsilon_dpo/beta_margin_grad_std": 0.011431388556957245, "epsilon_dpo/beta_margin_mean": 0.0058038788847625256, "epsilon_dpo/beta_margin_std": 0.04577384516596794, "epsilon_dpo/loss_margin_mean": 0.06193992495536804, "grad_norm": 21.574052810668945, "kl/avg_steps": 0.0625, "kl/beta": 0.09916140884160995, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 3.134328358208955e-07, "logits/chosen": 0.5684385299682617, "logits/rejected": 0.4524195194244385, "logps/chosen": -97.50875854492188, "logps/ref_chosen": -97.4771728515625, "logps/ref_rejected": -100.22068786621094, "logps/rejected": -100.314208984375, "loss": 1.381, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0032527425792068243, "rewards/margins": 0.00580390403047204, "rewards/rejected": -0.00905664637684822, "step": 43 }, { "epoch": 0.06651549508692366, "epsilon_dpo/beta": 0.09895452111959457, "epsilon_dpo/beta_margin_grad_mean": -0.4991898238658905, "epsilon_dpo/beta_margin_grad_std": 0.013249047100543976, "epsilon_dpo/beta_margin_mean": 0.0032492447644472122, "epsilon_dpo/beta_margin_std": 0.05304902419447899, "epsilon_dpo/loss_margin_mean": 0.03694462776184082, "grad_norm": 21.381933212280273, "kl/avg_steps": 0.15625, "kl/beta": 0.09909947216510773, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.2089552238805965e-07, "logits/chosen": 0.23688527941703796, "logits/rejected": 0.36477425694465637, "logps/chosen": -94.30439758300781, "logps/ref_chosen": -94.24890899658203, "logps/ref_rejected": -107.57559967041016, "logps/rejected": -107.66804504394531, "loss": 1.3838, "rewards/accuracies": 0.5625, "rewards/chosen": -0.005697104148566723, "rewards/margins": 0.0032492445316165686, "rewards/rejected": -0.00894634798169136, "step": 44 }, { "epoch": 0.06802721088435375, "epsilon_dpo/beta": 0.09866083413362503, "epsilon_dpo/beta_margin_grad_mean": -0.495146244764328, "epsilon_dpo/beta_margin_grad_std": 0.011300712823867798, "epsilon_dpo/beta_margin_mean": 0.01942691206932068, "epsilon_dpo/beta_margin_std": 0.04522864893078804, "epsilon_dpo/loss_margin_mean": 0.20017725229263306, "grad_norm": 22.540977478027344, "kl/avg_steps": 0.296875, "kl/beta": 0.09894487261772156, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.2835820895522385e-07, "logits/chosen": 0.27466094493865967, "logits/rejected": 0.6556792259216309, "logps/chosen": -84.09241485595703, "logps/ref_chosen": -84.11470794677734, "logps/ref_rejected": -120.240234375, "logps/rejected": -120.41810607910156, "loss": 1.3675, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0020693570841103792, "rewards/margins": 0.019426988437771797, "rewards/rejected": -0.017357632517814636, "step": 45 }, { "epoch": 0.06953892668178382, "epsilon_dpo/beta": 0.09853853285312653, "epsilon_dpo/beta_margin_grad_mean": -0.4979417026042938, "epsilon_dpo/beta_margin_grad_std": 0.010288399644196033, "epsilon_dpo/beta_margin_mean": 0.008237614296376705, "epsilon_dpo/beta_margin_std": 0.041169725358486176, "epsilon_dpo/loss_margin_mean": 0.08688095211982727, "grad_norm": 21.157970428466797, "kl/avg_steps": 0.125, "kl/beta": 0.09865199774503708, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.3582089552238805e-07, "logits/chosen": 0.3935273587703705, "logits/rejected": 0.28722450137138367, "logps/chosen": -84.30718231201172, "logps/ref_chosen": -84.37173461914062, "logps/ref_rejected": -106.11967468261719, "logps/rejected": -106.14201354980469, "loss": 1.3785, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006227842066437006, "rewards/margins": 0.008237641304731369, "rewards/rejected": -0.0020097994711250067, "step": 46 }, { "epoch": 0.0710506424792139, "epsilon_dpo/beta": 0.09841551631689072, "epsilon_dpo/beta_margin_grad_mean": -0.49714115262031555, "epsilon_dpo/beta_margin_grad_std": 0.011357166804373264, "epsilon_dpo/beta_margin_mean": 0.011449804529547691, "epsilon_dpo/beta_margin_std": 0.04547470435500145, "epsilon_dpo/loss_margin_mean": 0.11970293521881104, "grad_norm": 20.536399841308594, "kl/avg_steps": 0.125, "kl/beta": 0.09852883964776993, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.432835820895522e-07, "logits/chosen": 0.38530057668685913, "logits/rejected": 0.711715579032898, "logps/chosen": -74.67720031738281, "logps/ref_chosen": -74.71006774902344, "logps/ref_rejected": -97.41304779052734, "logps/rejected": -97.4998779296875, "loss": 1.3754, "rewards/accuracies": 0.546875, "rewards/chosen": 0.003110171528533101, "rewards/margins": 0.011449852026998997, "rewards/rejected": -0.008339679799973965, "step": 47 }, { "epoch": 0.07256235827664399, "epsilon_dpo/beta": 0.09835416078567505, "epsilon_dpo/beta_margin_grad_mean": -0.4995853304862976, "epsilon_dpo/beta_margin_grad_std": 0.010593732818961143, "epsilon_dpo/beta_margin_mean": 0.0016609171871095896, "epsilon_dpo/beta_margin_std": 0.04239671304821968, "epsilon_dpo/loss_margin_mean": 0.020181596279144287, "grad_norm": 19.753515243530273, "kl/avg_steps": 0.0625, "kl/beta": 0.09840583056211472, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 3.507462686567164e-07, "logits/chosen": 0.2559836506843567, "logits/rejected": 0.528465986251831, "logps/chosen": -71.79025268554688, "logps/ref_chosen": -71.76451873779297, "logps/ref_rejected": -85.74510192871094, "logps/rejected": -85.79102325439453, "loss": 1.3851, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0027173953130841255, "rewards/margins": 0.001660920213907957, "rewards/rejected": -0.004378315526992083, "step": 48 }, { "epoch": 0.07407407407407407, "epsilon_dpo/beta": 0.0982312485575676, "epsilon_dpo/beta_margin_grad_mean": -0.4984458088874817, "epsilon_dpo/beta_margin_grad_std": 0.009596822783350945, "epsilon_dpo/beta_margin_mean": 0.0062199123203754425, "epsilon_dpo/beta_margin_std": 0.03840209171175957, "epsilon_dpo/loss_margin_mean": 0.06633976101875305, "grad_norm": 18.510061264038086, "kl/avg_steps": 0.125, "kl/beta": 0.0983443632721901, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.5820895522388055e-07, "logits/chosen": 0.526823878288269, "logits/rejected": 0.5619438886642456, "logps/chosen": -79.90139770507812, "logps/ref_chosen": -79.86338806152344, "logps/ref_rejected": -86.35136413574219, "logps/rejected": -86.45571899414062, "loss": 1.3805, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003875012043863535, "rewards/margins": 0.006219968199729919, "rewards/rejected": -0.010094979777932167, "step": 49 }, { "epoch": 0.07558578987150416, "epsilon_dpo/beta": 0.09801653027534485, "epsilon_dpo/beta_margin_grad_mean": -0.49809837341308594, "epsilon_dpo/beta_margin_grad_std": 0.009929750114679337, "epsilon_dpo/beta_margin_mean": 0.007609872613102198, "epsilon_dpo/beta_margin_std": 0.03973739221692085, "epsilon_dpo/loss_margin_mean": 0.08075441420078278, "grad_norm": 20.460899353027344, "kl/avg_steps": 0.21875, "kl/beta": 0.09822158515453339, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 3.6567164179104475e-07, "logits/chosen": 0.2808557152748108, "logits/rejected": 0.47312384843826294, "logps/chosen": -76.44319152832031, "logps/ref_chosen": -76.4841537475586, "logps/ref_rejected": -93.49752044677734, "logps/rejected": -93.53730773925781, "loss": 1.3791, "rewards/accuracies": 0.609375, "rewards/chosen": 0.003850931068882346, "rewards/margins": 0.0076098134741187096, "rewards/rejected": -0.003758882638067007, "step": 50 }, { "epoch": 0.07709750566893424, "epsilon_dpo/beta": 0.09789447486400604, "epsilon_dpo/beta_margin_grad_mean": -0.4988565742969513, "epsilon_dpo/beta_margin_grad_std": 0.012610708363354206, "epsilon_dpo/beta_margin_mean": 0.004573495592921972, "epsilon_dpo/beta_margin_std": 0.050480011850595474, "epsilon_dpo/loss_margin_mean": 0.05065241456031799, "grad_norm": 21.301637649536133, "kl/avg_steps": 0.125, "kl/beta": 0.0980071946978569, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 0.21216990053653717, "logits/rejected": 0.41427797079086304, "logps/chosen": -86.60366821289062, "logps/ref_chosen": -86.52101135253906, "logps/ref_rejected": -109.46188354492188, "logps/rejected": -109.59519958496094, "loss": 1.3824, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008307162672281265, "rewards/margins": 0.004573536571115255, "rewards/rejected": -0.012880699709057808, "step": 51 }, { "epoch": 0.07860922146636433, "epsilon_dpo/beta": 0.09764989465475082, "epsilon_dpo/beta_margin_grad_mean": -0.49518582224845886, "epsilon_dpo/beta_margin_grad_std": 0.013356276787817478, "epsilon_dpo/beta_margin_mean": 0.0192727018147707, "epsilon_dpo/beta_margin_std": 0.05347498133778572, "epsilon_dpo/loss_margin_mean": 0.2012864351272583, "grad_norm": 22.263221740722656, "kl/avg_steps": 0.25, "kl/beta": 0.09788484126329422, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.805970149253731e-07, "logits/chosen": 0.2791569232940674, "logits/rejected": 0.3946777582168579, "logps/chosen": -95.08229064941406, "logps/ref_chosen": -95.06444549560547, "logps/ref_rejected": -123.63211059570312, "logps/rejected": -123.85124206542969, "loss": 1.3678, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0018905354663729668, "rewards/margins": 0.019272757694125175, "rewards/rejected": -0.021163294091820717, "step": 52 }, { "epoch": 0.0801209372637944, "epsilon_dpo/beta": 0.09765049815177917, "epsilon_dpo/beta_margin_grad_mean": -0.499002605676651, "epsilon_dpo/beta_margin_grad_std": 0.008922034874558449, "epsilon_dpo/beta_margin_mean": 0.003993574995547533, "epsilon_dpo/beta_margin_std": 0.03570278361439705, "epsilon_dpo/loss_margin_mean": 0.0436423122882843, "grad_norm": 19.184598922729492, "kl/avg_steps": 0.0, "kl/beta": 0.0976407378911972, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 3.880597014925373e-07, "logits/chosen": 0.19146090745925903, "logits/rejected": 0.24986706674098969, "logps/chosen": -79.03793334960938, "logps/ref_chosen": -79.04600524902344, "logps/ref_rejected": -83.90208435058594, "logps/rejected": -83.93766784667969, "loss": 1.3826, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0006839814595878124, "rewards/margins": 0.003993605263531208, "rewards/rejected": -0.0033096238039433956, "step": 53 }, { "epoch": 0.08163265306122448, "epsilon_dpo/beta": 0.09755895286798477, "epsilon_dpo/beta_margin_grad_mean": -0.4974757432937622, "epsilon_dpo/beta_margin_grad_std": 0.009732147678732872, "epsilon_dpo/beta_margin_mean": 0.010100303217768669, "epsilon_dpo/beta_margin_std": 0.038946740329265594, "epsilon_dpo/loss_margin_mean": 0.10647621750831604, "grad_norm": 18.53166961669922, "kl/avg_steps": 0.09375, "kl/beta": 0.0976407378911972, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 3.9552238805970144e-07, "logits/chosen": 0.24084338545799255, "logits/rejected": 0.5024117827415466, "logps/chosen": -71.29666137695312, "logps/ref_chosen": -71.31385803222656, "logps/ref_rejected": -82.59730529785156, "logps/rejected": -82.68658447265625, "loss": 1.3766, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0015400029951706529, "rewards/margins": 0.010100292041897774, "rewards/rejected": -0.008560288697481155, "step": 54 }, { "epoch": 0.08314436885865457, "epsilon_dpo/beta": 0.09726925939321518, "epsilon_dpo/beta_margin_grad_mean": -0.49761292338371277, "epsilon_dpo/beta_margin_grad_std": 0.01231041457504034, "epsilon_dpo/beta_margin_mean": 0.009554947726428509, "epsilon_dpo/beta_margin_std": 0.04927734658122063, "epsilon_dpo/loss_margin_mean": 0.10191529989242554, "grad_norm": 21.672401428222656, "kl/avg_steps": 0.296875, "kl/beta": 0.09754928946495056, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.0298507462686564e-07, "logits/chosen": 0.5175954103469849, "logits/rejected": 0.7563967704772949, "logps/chosen": -72.74899291992188, "logps/ref_chosen": -72.7046127319336, "logps/ref_rejected": -117.64811706542969, "logps/rejected": -117.7944107055664, "loss": 1.3774, "rewards/accuracies": 0.609375, "rewards/chosen": -0.004462738521397114, "rewards/margins": 0.009554837830364704, "rewards/rejected": -0.014017576351761818, "step": 55 }, { "epoch": 0.08465608465608465, "epsilon_dpo/beta": 0.09714867919683456, "epsilon_dpo/beta_margin_grad_mean": -0.4990710914134979, "epsilon_dpo/beta_margin_grad_std": 0.011907228268682957, "epsilon_dpo/beta_margin_mean": 0.0037128631956875324, "epsilon_dpo/beta_margin_std": 0.0476621612906456, "epsilon_dpo/loss_margin_mean": 0.04199719429016113, "grad_norm": 19.10873794555664, "kl/avg_steps": 0.125, "kl/beta": 0.09726054221391678, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.1044776119402984e-07, "logits/chosen": 0.301601767539978, "logits/rejected": 0.4184657335281372, "logps/chosen": -80.00164794921875, "logps/ref_chosen": -79.95884704589844, "logps/ref_rejected": -86.45406341552734, "logps/rejected": -86.53886413574219, "loss": 1.3832, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0043641347438097, "rewards/margins": 0.0037128704134374857, "rewards/rejected": -0.008077004924416542, "step": 56 }, { "epoch": 0.08616780045351474, "epsilon_dpo/beta": 0.09693632274866104, "epsilon_dpo/beta_margin_grad_mean": -0.4949696958065033, "epsilon_dpo/beta_margin_grad_std": 0.014986831694841385, "epsilon_dpo/beta_margin_mean": 0.02014934830367565, "epsilon_dpo/beta_margin_std": 0.06002877652645111, "epsilon_dpo/loss_margin_mean": 0.2123461365699768, "grad_norm": 23.030841827392578, "kl/avg_steps": 0.21875, "kl/beta": 0.09713912010192871, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.17910447761194e-07, "logits/chosen": 0.3571116328239441, "logits/rejected": 0.2162424772977829, "logps/chosen": -80.38567352294922, "logps/ref_chosen": -80.30390930175781, "logps/ref_rejected": -119.16410827636719, "logps/rejected": -119.45822143554688, "loss": 1.3671, "rewards/accuracies": 0.59375, "rewards/chosen": -0.008071388117969036, "rewards/margins": 0.020149312913417816, "rewards/rejected": -0.028220700100064278, "step": 57 }, { "epoch": 0.08767951625094482, "epsilon_dpo/beta": 0.09696707874536514, "epsilon_dpo/beta_margin_grad_mean": -0.49949392676353455, "epsilon_dpo/beta_margin_grad_std": 0.013705207034945488, "epsilon_dpo/beta_margin_mean": 0.0020344173535704613, "epsilon_dpo/beta_margin_std": 0.054878149181604385, "epsilon_dpo/loss_margin_mean": 0.02534008026123047, "grad_norm": 22.337549209594727, "kl/avg_steps": -0.03125, "kl/beta": 0.09692709147930145, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 4.253731343283582e-07, "logits/chosen": 0.42537829279899597, "logits/rejected": 0.3799484670162201, "logps/chosen": -99.42530822753906, "logps/ref_chosen": -99.30342864990234, "logps/ref_rejected": -111.72179412841797, "logps/rejected": -111.8690185546875, "loss": 1.385, "rewards/accuracies": 0.484375, "rewards/chosen": -0.012063508853316307, "rewards/margins": 0.002034379169344902, "rewards/rejected": -0.014097888022661209, "step": 58 }, { "epoch": 0.08919123204837491, "epsilon_dpo/beta": 0.09672466665506363, "epsilon_dpo/beta_margin_grad_mean": -0.4958970248699188, "epsilon_dpo/beta_margin_grad_std": 0.012421243824064732, "epsilon_dpo/beta_margin_mean": 0.016427284106612206, "epsilon_dpo/beta_margin_std": 0.04973134025931358, "epsilon_dpo/loss_margin_mean": 0.17345958948135376, "grad_norm": 19.432371139526367, "kl/avg_steps": 0.25, "kl/beta": 0.09695739299058914, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.3283582089552234e-07, "logits/chosen": 0.4255552291870117, "logits/rejected": 0.2160358428955078, "logps/chosen": -71.87731170654297, "logps/ref_chosen": -71.85493469238281, "logps/ref_rejected": -97.24525451660156, "logps/rejected": -97.44110107421875, "loss": 1.3706, "rewards/accuracies": 0.609375, "rewards/chosen": -0.002300878055393696, "rewards/margins": 0.01642729714512825, "rewards/rejected": -0.01872817426919937, "step": 59 }, { "epoch": 0.09070294784580499, "epsilon_dpo/beta": 0.09657414257526398, "epsilon_dpo/beta_margin_grad_mean": -0.4995685815811157, "epsilon_dpo/beta_margin_grad_std": 0.012486227788031101, "epsilon_dpo/beta_margin_mean": 0.001717747189104557, "epsilon_dpo/beta_margin_std": 0.04999338835477829, "epsilon_dpo/loss_margin_mean": 0.021374523639678955, "grad_norm": 20.249168395996094, "kl/avg_steps": 0.15625, "kl/beta": 0.09671560674905777, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.4029850746268654e-07, "logits/chosen": 0.3270602822303772, "logits/rejected": 0.4148187041282654, "logps/chosen": -87.35511779785156, "logps/ref_chosen": -87.21815490722656, "logps/ref_rejected": -82.96167755126953, "logps/rejected": -83.12001037597656, "loss": 1.3852, "rewards/accuracies": 0.578125, "rewards/chosen": -0.013473069295287132, "rewards/margins": 0.0017178304260596633, "rewards/rejected": -0.01519089937210083, "step": 60 }, { "epoch": 0.09221466364323508, "epsilon_dpo/beta": 0.09663473814725876, "epsilon_dpo/beta_margin_grad_mean": -0.4984928071498871, "epsilon_dpo/beta_margin_grad_std": 0.013760149478912354, "epsilon_dpo/beta_margin_mean": 0.006036721635609865, "epsilon_dpo/beta_margin_std": 0.05508105829358101, "epsilon_dpo/loss_margin_mean": 0.06708309054374695, "grad_norm": 18.39585304260254, "kl/avg_steps": -0.0625, "kl/beta": 0.09656472504138947, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 0.05619889125227928, "logits/rejected": 0.49303629994392395, "logps/chosen": -70.44015502929688, "logps/ref_chosen": -70.37516784667969, "logps/ref_rejected": -93.18226623535156, "logps/rejected": -93.3143310546875, "loss": 1.381, "rewards/accuracies": 0.453125, "rewards/chosen": -0.006510016042739153, "rewards/margins": 0.006036766339093447, "rewards/rejected": -0.012546783313155174, "step": 61 }, { "epoch": 0.09372637944066516, "epsilon_dpo/beta": 0.09655912220478058, "epsilon_dpo/beta_margin_grad_mean": -0.49866795539855957, "epsilon_dpo/beta_margin_grad_std": 0.014224527403712273, "epsilon_dpo/beta_margin_mean": 0.005336884409189224, "epsilon_dpo/beta_margin_std": 0.05695384740829468, "epsilon_dpo/loss_margin_mean": 0.059721678495407104, "grad_norm": 20.705806732177734, "kl/avg_steps": 0.078125, "kl/beta": 0.09662511199712753, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.53125, "learning_rate": 4.552238805970149e-07, "logits/chosen": 0.2867963910102844, "logits/rejected": 0.4729596972465515, "logps/chosen": -87.79457092285156, "logps/ref_chosen": -87.66146850585938, "logps/ref_rejected": -103.1637954711914, "logps/rejected": -103.35662078857422, "loss": 1.3818, "rewards/accuracies": 0.515625, "rewards/chosen": -0.013055233284831047, "rewards/margins": 0.0053368983790278435, "rewards/rejected": -0.018392130732536316, "step": 62 }, { "epoch": 0.09523809523809523, "epsilon_dpo/beta": 0.0963481143116951, "epsilon_dpo/beta_margin_grad_mean": -0.4944593906402588, "epsilon_dpo/beta_margin_grad_std": 0.014144835993647575, "epsilon_dpo/beta_margin_mean": 0.022186141461133957, "epsilon_dpo/beta_margin_std": 0.05663369968533516, "epsilon_dpo/loss_margin_mean": 0.2345995306968689, "grad_norm": 22.35875701904297, "kl/avg_steps": 0.21875, "kl/beta": 0.09654968231916428, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.626865671641791e-07, "logits/chosen": 0.3238663375377655, "logits/rejected": 0.24062295258045197, "logps/chosen": -83.79109954833984, "logps/ref_chosen": -83.74502563476562, "logps/ref_rejected": -107.7850341796875, "logps/rejected": -108.0656967163086, "loss": 1.365, "rewards/accuracies": 0.609375, "rewards/chosen": -0.004585597664117813, "rewards/margins": 0.02218601480126381, "rewards/rejected": -0.026771612465381622, "step": 63 }, { "epoch": 0.09674981103552532, "epsilon_dpo/beta": 0.09631846100091934, "epsilon_dpo/beta_margin_grad_mean": -0.4993444085121155, "epsilon_dpo/beta_margin_grad_std": 0.01322327833622694, "epsilon_dpo/beta_margin_mean": 0.0026313355192542076, "epsilon_dpo/beta_margin_std": 0.05294317379593849, "epsilon_dpo/loss_margin_mean": 0.03148919343948364, "grad_norm": 17.767595291137695, "kl/avg_steps": 0.03125, "kl/beta": 0.09633894264698029, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.701492537313433e-07, "logits/chosen": 0.24474751949310303, "logits/rejected": 0.3736121654510498, "logps/chosen": -74.78924560546875, "logps/ref_chosen": -74.63839721679688, "logps/ref_rejected": -88.96295166015625, "logps/rejected": -89.1452865600586, "loss": 1.3844, "rewards/accuracies": 0.53125, "rewards/chosen": -0.014759637415409088, "rewards/margins": 0.0026313296984881163, "rewards/rejected": -0.017390966415405273, "step": 64 }, { "epoch": 0.0982615268329554, "epsilon_dpo/beta": 0.09598737955093384, "epsilon_dpo/beta_margin_grad_mean": -0.4935208261013031, "epsilon_dpo/beta_margin_grad_std": 0.016076454892754555, "epsilon_dpo/beta_margin_mean": 0.02594032883644104, "epsilon_dpo/beta_margin_std": 0.06438540667295456, "epsilon_dpo/loss_margin_mean": 0.2748907208442688, "grad_norm": 21.29926872253418, "kl/avg_steps": 0.34375, "kl/beta": 0.09630884230136871, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.776119402985074e-07, "logits/chosen": 0.1908925324678421, "logits/rejected": 0.32332342863082886, "logps/chosen": -90.84135437011719, "logps/ref_chosen": -90.74732971191406, "logps/ref_rejected": -103.91267395019531, "logps/rejected": -104.28158569335938, "loss": 1.3616, "rewards/accuracies": 0.703125, "rewards/chosen": -0.009328576736152172, "rewards/margins": 0.025940343737602234, "rewards/rejected": -0.03526891767978668, "step": 65 }, { "epoch": 0.09977324263038549, "epsilon_dpo/beta": 0.09580853581428528, "epsilon_dpo/beta_margin_grad_mean": -0.49857693910598755, "epsilon_dpo/beta_margin_grad_std": 0.014432637952268124, "epsilon_dpo/beta_margin_mean": 0.005690241232514381, "epsilon_dpo/beta_margin_std": 0.05780486389994621, "epsilon_dpo/loss_margin_mean": 0.06374523043632507, "grad_norm": 20.495637893676758, "kl/avg_steps": 0.1875, "kl/beta": 0.09597891569137573, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.850746268656717e-07, "logits/chosen": 0.17386680841445923, "logits/rejected": 0.23358601331710815, "logps/chosen": -95.62905883789062, "logps/ref_chosen": -95.40287780761719, "logps/ref_rejected": -103.42100524902344, "logps/rejected": -103.7109375, "loss": 1.3814, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02190394513309002, "rewards/margins": 0.005690223537385464, "rewards/rejected": -0.027594169601798058, "step": 66 }, { "epoch": 0.10128495842781557, "epsilon_dpo/beta": 0.09571905434131622, "epsilon_dpo/beta_margin_grad_mean": -0.49876338243484497, "epsilon_dpo/beta_margin_grad_std": 0.013045243918895721, "epsilon_dpo/beta_margin_mean": 0.004954536911100149, "epsilon_dpo/beta_margin_std": 0.052225664258003235, "epsilon_dpo/loss_margin_mean": 0.056086331605911255, "grad_norm": 22.02642059326172, "kl/avg_steps": 0.09375, "kl/beta": 0.09579929709434509, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.925373134328357e-07, "logits/chosen": 0.32260456681251526, "logits/rejected": 0.4844122529029846, "logps/chosen": -84.48126220703125, "logps/ref_chosen": -84.30548095703125, "logps/ref_rejected": -107.89781188964844, "logps/rejected": -108.12968444824219, "loss": 1.382, "rewards/accuracies": 0.53125, "rewards/chosen": -0.017115123569965363, "rewards/margins": 0.004954596050083637, "rewards/rejected": -0.022069718688726425, "step": 67 }, { "epoch": 0.10279667422524566, "epsilon_dpo/beta": 0.09562939405441284, "epsilon_dpo/beta_margin_grad_mean": -0.4978659451007843, "epsilon_dpo/beta_margin_grad_std": 0.014736686833202839, "epsilon_dpo/beta_margin_mean": 0.008545473217964172, "epsilon_dpo/beta_margin_std": 0.059007227420806885, "epsilon_dpo/loss_margin_mean": 0.09425216913223267, "grad_norm": 21.87251091003418, "kl/avg_steps": 0.09375, "kl/beta": 0.09570956230163574, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 5e-07, "logits/chosen": 0.45750993490219116, "logits/rejected": 0.4745368957519531, "logps/chosen": -107.02864074707031, "logps/ref_chosen": -106.75498962402344, "logps/ref_rejected": -122.26151275634766, "logps/rejected": -122.62940979003906, "loss": 1.3786, "rewards/accuracies": 0.546875, "rewards/chosen": -0.02646515890955925, "rewards/margins": 0.008545521646738052, "rewards/rejected": -0.0350106805562973, "step": 68 }, { "epoch": 0.10430839002267574, "epsilon_dpo/beta": 0.09539041668176651, "epsilon_dpo/beta_margin_grad_mean": -0.4965830445289612, "epsilon_dpo/beta_margin_grad_std": 0.01738860085606575, "epsilon_dpo/beta_margin_mean": 0.013665448874235153, "epsilon_dpo/beta_margin_std": 0.06965122371912003, "epsilon_dpo/loss_margin_mean": 0.14875519275665283, "grad_norm": 21.092300415039062, "kl/avg_steps": 0.25, "kl/beta": 0.09561992436647415, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.999965034812934e-07, "logits/chosen": 0.41545820236206055, "logits/rejected": 0.5641710758209229, "logps/chosen": -91.29107666015625, "logps/ref_chosen": -91.08714294433594, "logps/ref_rejected": -115.80476379394531, "logps/rejected": -116.15745544433594, "loss": 1.3739, "rewards/accuracies": 0.609375, "rewards/chosen": -0.01966257020831108, "rewards/margins": 0.013665375299751759, "rewards/rejected": -0.033327944576740265, "step": 69 }, { "epoch": 0.10582010582010581, "epsilon_dpo/beta": 0.09519709646701813, "epsilon_dpo/beta_margin_grad_mean": -0.4971049427986145, "epsilon_dpo/beta_margin_grad_std": 0.01725912094116211, "epsilon_dpo/beta_margin_mean": 0.011591562069952488, "epsilon_dpo/beta_margin_std": 0.06912032514810562, "epsilon_dpo/loss_margin_mean": 0.12734776735305786, "grad_norm": 20.18518829345703, "kl/avg_steps": 0.203125, "kl/beta": 0.0953814685344696, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.999860140229787e-07, "logits/chosen": 0.43227583169937134, "logits/rejected": 0.42724496126174927, "logps/chosen": -88.18643188476562, "logps/ref_chosen": -88.12431335449219, "logps/ref_rejected": -99.41046905517578, "logps/rejected": -99.59992980957031, "loss": 1.3759, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006189244333654642, "rewards/margins": 0.011591588146984577, "rewards/rejected": -0.017780832946300507, "step": 70 }, { "epoch": 0.1073318216175359, "epsilon_dpo/beta": 0.09507863968610764, "epsilon_dpo/beta_margin_grad_mean": -0.5000821948051453, "epsilon_dpo/beta_margin_grad_std": 0.01915271021425724, "epsilon_dpo/beta_margin_mean": -0.00034239908563904464, "epsilon_dpo/beta_margin_std": 0.07671481370925903, "epsilon_dpo/loss_margin_mean": 0.002892807126045227, "grad_norm": 19.287860870361328, "kl/avg_steps": 0.125, "kl/beta": 0.09518811851739883, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.999685319184688e-07, "logits/chosen": 0.10468558967113495, "logits/rejected": 0.2944987118244171, "logps/chosen": -87.19418334960938, "logps/ref_chosen": -86.94244384765625, "logps/ref_rejected": -92.30281829833984, "logps/rejected": -92.55744934082031, "loss": 1.3881, "rewards/accuracies": 0.5625, "rewards/chosen": -0.024355504661798477, "rewards/margins": -0.000342401210218668, "rewards/rejected": -0.024013103917241096, "step": 71 }, { "epoch": 0.10884353741496598, "epsilon_dpo/beta": 0.09478166699409485, "epsilon_dpo/beta_margin_grad_mean": -0.49388766288757324, "epsilon_dpo/beta_margin_grad_std": 0.017899975180625916, "epsilon_dpo/beta_margin_mean": 0.024477774277329445, "epsilon_dpo/beta_margin_std": 0.07170306146144867, "epsilon_dpo/loss_margin_mean": 0.26375046372413635, "grad_norm": 21.809051513671875, "kl/avg_steps": 0.3125, "kl/beta": 0.0950692817568779, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.999440576567755e-07, "logits/chosen": 0.31559497117996216, "logits/rejected": 0.34403306245803833, "logps/chosen": -77.3045654296875, "logps/ref_chosen": -77.19921112060547, "logps/ref_rejected": -105.40240478515625, "logps/rejected": -105.77151489257812, "loss": 1.3633, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010216562077403069, "rewards/margins": 0.02447780966758728, "rewards/rejected": -0.0346943698823452, "step": 72 }, { "epoch": 0.11035525321239607, "epsilon_dpo/beta": 0.09469373524188995, "epsilon_dpo/beta_margin_grad_mean": -0.4983028471469879, "epsilon_dpo/beta_margin_grad_std": 0.019205359742045403, "epsilon_dpo/beta_margin_mean": 0.006816650275141001, "epsilon_dpo/beta_margin_std": 0.07694670557975769, "epsilon_dpo/loss_margin_mean": 0.0784463882446289, "grad_norm": 21.700668334960938, "kl/avg_steps": 0.09375, "kl/beta": 0.09477311372756958, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.999125919224965e-07, "logits/chosen": 0.45154646039009094, "logits/rejected": 0.4655054211616516, "logps/chosen": -95.38381958007812, "logps/ref_chosen": -95.17266845703125, "logps/ref_rejected": -104.91008758544922, "logps/rejected": -105.19967651367188, "loss": 1.381, "rewards/accuracies": 0.515625, "rewards/chosen": -0.02027609571814537, "rewards/margins": 0.006816700100898743, "rewards/rejected": -0.027092795819044113, "step": 73 }, { "epoch": 0.11186696900982615, "epsilon_dpo/beta": 0.0943978950381279, "epsilon_dpo/beta_margin_grad_mean": -0.4949096143245697, "epsilon_dpo/beta_margin_grad_std": 0.015205258503556252, "epsilon_dpo/beta_margin_mean": 0.02038200944662094, "epsilon_dpo/beta_margin_std": 0.06088561564683914, "epsilon_dpo/loss_margin_mean": 0.22069677710533142, "grad_norm": 19.499149322509766, "kl/avg_steps": 0.3125, "kl/beta": 0.09468434751033783, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.998741355957963e-07, "logits/chosen": 0.26441439986228943, "logits/rejected": 0.3638787865638733, "logps/chosen": -70.67909240722656, "logps/ref_chosen": -70.6429214477539, "logps/ref_rejected": -102.84405517578125, "logps/rejected": -103.1009292602539, "loss": 1.3669, "rewards/accuracies": 0.640625, "rewards/chosen": -0.003621622920036316, "rewards/margins": 0.020382072776556015, "rewards/rejected": -0.02400369569659233, "step": 74 }, { "epoch": 0.11337868480725624, "epsilon_dpo/beta": 0.09413331747055054, "epsilon_dpo/beta_margin_grad_mean": -0.4952016770839691, "epsilon_dpo/beta_margin_grad_std": 0.017589138820767403, "epsilon_dpo/beta_margin_mean": 0.019206415861845016, "epsilon_dpo/beta_margin_std": 0.0704592615365982, "epsilon_dpo/loss_margin_mean": 0.2095319628715515, "grad_norm": 18.27823829650879, "kl/avg_steps": 0.28125, "kl/beta": 0.09438937902450562, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.998286897523808e-07, "logits/chosen": 0.15772899985313416, "logits/rejected": 0.2349020391702652, "logps/chosen": -65.92100524902344, "logps/ref_chosen": -65.809326171875, "logps/ref_rejected": -87.82281494140625, "logps/rejected": -88.14402770996094, "loss": 1.3684, "rewards/accuracies": 0.625, "rewards/chosen": -0.010867921635508537, "rewards/margins": 0.01920643448829651, "rewards/rejected": -0.030074356123805046, "step": 75 }, { "epoch": 0.11489040060468632, "epsilon_dpo/beta": 0.0939575582742691, "epsilon_dpo/beta_margin_grad_mean": -0.49271103739738464, "epsilon_dpo/beta_margin_grad_std": 0.019078722223639488, "epsilon_dpo/beta_margin_mean": 0.02922924794256687, "epsilon_dpo/beta_margin_std": 0.07647933810949326, "epsilon_dpo/loss_margin_mean": 0.3171220123767853, "grad_norm": 19.83082389831543, "kl/avg_steps": 0.1875, "kl/beta": 0.09412465244531631, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.997762556634679e-07, "logits/chosen": 0.2850942611694336, "logits/rejected": 0.3514692783355713, "logps/chosen": -75.77113342285156, "logps/ref_chosen": -75.6954574584961, "logps/ref_rejected": -104.007568359375, "logps/rejected": -104.40037536621094, "loss": 1.3587, "rewards/accuracies": 0.578125, "rewards/chosen": -0.007265619933605194, "rewards/margins": 0.029229262843728065, "rewards/rejected": -0.03649488091468811, "step": 76 }, { "epoch": 0.1164021164021164, "epsilon_dpo/beta": 0.09357620030641556, "epsilon_dpo/beta_margin_grad_mean": -0.491890013217926, "epsilon_dpo/beta_margin_grad_std": 0.020320318639278412, "epsilon_dpo/beta_margin_mean": 0.03250795230269432, "epsilon_dpo/beta_margin_std": 0.08144384622573853, "epsilon_dpo/loss_margin_mean": 0.3533650040626526, "grad_norm": 19.243268966674805, "kl/avg_steps": 0.40625, "kl/beta": 0.09394850581884384, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.99716834795752e-07, "logits/chosen": 0.18683543801307678, "logits/rejected": 0.33957868814468384, "logps/chosen": -78.54193878173828, "logps/ref_chosen": -78.43965148925781, "logps/ref_rejected": -101.59224700927734, "logps/rejected": -102.04789733886719, "loss": 1.3557, "rewards/accuracies": 0.703125, "rewards/chosen": -0.009762855246663094, "rewards/margins": 0.032507993280887604, "rewards/rejected": -0.04227085039019585, "step": 77 }, { "epoch": 0.11791383219954649, "epsilon_dpo/beta": 0.09346076846122742, "epsilon_dpo/beta_margin_grad_mean": -0.4948379695415497, "epsilon_dpo/beta_margin_grad_std": 0.02001882530748844, "epsilon_dpo/beta_margin_mean": 0.020674889907240868, "epsilon_dpo/beta_margin_std": 0.08018743991851807, "epsilon_dpo/loss_margin_mean": 0.22830542922019958, "grad_norm": 19.162540435791016, "kl/avg_steps": 0.125, "kl/beta": 0.09356838464736938, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.996504288113623e-07, "logits/chosen": 0.30345967411994934, "logits/rejected": 0.33946341276168823, "logps/chosen": -90.49433898925781, "logps/ref_chosen": -90.30522155761719, "logps/ref_rejected": -101.54324340820312, "logps/rejected": -101.96066284179688, "loss": 1.3673, "rewards/accuracies": 0.5625, "rewards/chosen": -0.017984943464398384, "rewards/margins": 0.020674871280789375, "rewards/rejected": -0.03865981474518776, "step": 78 }, { "epoch": 0.11942554799697656, "epsilon_dpo/beta": 0.09302281588315964, "epsilon_dpo/beta_margin_grad_mean": -0.4897191822528839, "epsilon_dpo/beta_margin_grad_std": 0.02295556291937828, "epsilon_dpo/beta_margin_mean": 0.04126286879181862, "epsilon_dpo/beta_margin_std": 0.09222093224525452, "epsilon_dpo/loss_margin_mean": 0.44970422983169556, "grad_norm": 20.854549407958984, "kl/avg_steps": 0.46875, "kl/beta": 0.09345156699419022, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.995770395678171e-07, "logits/chosen": 0.2500825524330139, "logits/rejected": 0.3519323766231537, "logps/chosen": -71.46902465820312, "logps/ref_chosen": -71.36767578125, "logps/ref_rejected": -110.53877258300781, "logps/rejected": -111.08982849121094, "loss": 1.3476, "rewards/accuracies": 0.734375, "rewards/chosen": -0.009614603593945503, "rewards/margins": 0.04126285761594772, "rewards/rejected": -0.05087745934724808, "step": 79 }, { "epoch": 0.12093726379440665, "epsilon_dpo/beta": 0.09285043925046921, "epsilon_dpo/beta_margin_grad_mean": -0.49484947323799133, "epsilon_dpo/beta_margin_grad_std": 0.022218074649572372, "epsilon_dpo/beta_margin_mean": 0.02064460888504982, "epsilon_dpo/beta_margin_std": 0.08906199038028717, "epsilon_dpo/loss_margin_mean": 0.2295338213443756, "grad_norm": 19.17706871032715, "kl/avg_steps": 0.1875, "kl/beta": 0.09301555901765823, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.994966691179711e-07, "logits/chosen": 0.3593171536922455, "logits/rejected": 0.20739956200122833, "logps/chosen": -83.57428741455078, "logps/ref_chosen": -83.30223083496094, "logps/ref_rejected": -108.11129760742188, "logps/rejected": -108.61288452148438, "loss": 1.3677, "rewards/accuracies": 0.609375, "rewards/chosen": -0.025694172829389572, "rewards/margins": 0.02064460888504982, "rewards/rejected": -0.04633878171443939, "step": 80 }, { "epoch": 0.12244897959183673, "epsilon_dpo/beta": 0.0924445390701294, "epsilon_dpo/beta_margin_grad_mean": -0.4913916289806366, "epsilon_dpo/beta_margin_grad_std": 0.019580332562327385, "epsilon_dpo/beta_margin_mean": 0.03447142615914345, "epsilon_dpo/beta_margin_std": 0.07844133675098419, "epsilon_dpo/loss_margin_mean": 0.3787657916545868, "grad_norm": 19.880578994750977, "kl/avg_steps": 0.4375, "kl/beta": 0.09284147620201111, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.994093197099587e-07, "logits/chosen": 0.18329188227653503, "logits/rejected": 0.18096214532852173, "logps/chosen": -86.397705078125, "logps/ref_chosen": -86.2278060913086, "logps/ref_rejected": -102.50193786621094, "logps/rejected": -103.05059051513672, "loss": 1.3537, "rewards/accuracies": 0.734375, "rewards/chosen": -0.01595979556441307, "rewards/margins": 0.034471385180950165, "rewards/rejected": -0.05043117702007294, "step": 81 }, { "epoch": 0.12396069538926682, "epsilon_dpo/beta": 0.09207074344158173, "epsilon_dpo/beta_margin_grad_mean": -0.48697853088378906, "epsilon_dpo/beta_margin_grad_std": 0.02084416151046753, "epsilon_dpo/beta_margin_mean": 0.05220121517777443, "epsilon_dpo/beta_margin_std": 0.0836104229092598, "epsilon_dpo/loss_margin_mean": 0.5733838677406311, "grad_norm": 18.803672790527344, "kl/avg_steps": 0.40625, "kl/beta": 0.09243706613779068, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.993149937871306e-07, "logits/chosen": 0.18002444505691528, "logits/rejected": 0.2574087381362915, "logps/chosen": -68.62680053710938, "logps/ref_chosen": -68.67723846435547, "logps/ref_rejected": -93.22198486328125, "logps/rejected": -93.74493408203125, "loss": 1.3365, "rewards/accuracies": 0.703125, "rewards/chosen": 0.0043902089819312096, "rewards/margins": 0.05220119655132294, "rewards/rejected": -0.04781098663806915, "step": 82 }, { "epoch": 0.1254724111866969, "epsilon_dpo/beta": 0.09175577014684677, "epsilon_dpo/beta_margin_grad_mean": -0.48969775438308716, "epsilon_dpo/beta_margin_grad_std": 0.021411551162600517, "epsilon_dpo/beta_margin_mean": 0.04130866378545761, "epsilon_dpo/beta_margin_std": 0.0858808383345604, "epsilon_dpo/loss_margin_mean": 0.4566318392753601, "grad_norm": 19.55249786376953, "kl/avg_steps": 0.34375, "kl/beta": 0.0920630618929863, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.992136939879856e-07, "logits/chosen": 0.1689714640378952, "logits/rejected": 0.10307054221630096, "logps/chosen": -79.06647491455078, "logps/ref_chosen": -79.0299072265625, "logps/ref_rejected": -100.61555480957031, "logps/rejected": -101.10874938964844, "loss": 1.3473, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0036600963212549686, "rewards/margins": 0.0413086824119091, "rewards/rejected": -0.044968780130147934, "step": 83 }, { "epoch": 0.12698412698412698, "epsilon_dpo/beta": 0.09152746200561523, "epsilon_dpo/beta_margin_grad_mean": -0.4955390691757202, "epsilon_dpo/beta_margin_grad_std": 0.024220502004027367, "epsilon_dpo/beta_margin_mean": 0.017843004316091537, "epsilon_dpo/beta_margin_std": 0.09716963768005371, "epsilon_dpo/loss_margin_mean": 0.20263022184371948, "grad_norm": 21.091583251953125, "kl/avg_steps": 0.25, "kl/beta": 0.09174767881631851, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.991054231460969e-07, "logits/chosen": 0.3326488733291626, "logits/rejected": 0.47113683819770813, "logps/chosen": -87.80223083496094, "logps/ref_chosen": -87.48562622070312, "logps/ref_rejected": -107.02767944335938, "logps/rejected": -107.54692077636719, "loss": 1.3709, "rewards/accuracies": 0.609375, "rewards/chosen": -0.029343653470277786, "rewards/margins": 0.01784295029938221, "rewards/rejected": -0.04718660190701485, "step": 84 }, { "epoch": 0.12849584278155707, "epsilon_dpo/beta": 0.09109899401664734, "epsilon_dpo/beta_margin_grad_mean": -0.4891459047794342, "epsilon_dpo/beta_margin_grad_std": 0.023020127788186073, "epsilon_dpo/beta_margin_mean": 0.043475423008203506, "epsilon_dpo/beta_margin_std": 0.09232480823993683, "epsilon_dpo/loss_margin_mean": 0.4838877320289612, "grad_norm": 18.248411178588867, "kl/avg_steps": 0.46875, "kl/beta": 0.09151887893676758, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.989901842900325e-07, "logits/chosen": 0.1454373598098755, "logits/rejected": 0.0769268125295639, "logps/chosen": -74.00840759277344, "logps/ref_chosen": -73.96684265136719, "logps/ref_rejected": -91.79646301269531, "logps/rejected": -92.32191467285156, "loss": 1.3454, "rewards/accuracies": 0.75, "rewards/chosen": -0.0040864236652851105, "rewards/margins": 0.04347540810704231, "rewards/rejected": -0.04756183177232742, "step": 85 }, { "epoch": 0.13000755857898716, "epsilon_dpo/beta": 0.09081630408763885, "epsilon_dpo/beta_margin_grad_mean": -0.49439719319343567, "epsilon_dpo/beta_margin_grad_std": 0.020537180826067924, "epsilon_dpo/beta_margin_mean": 0.02242736890912056, "epsilon_dpo/beta_margin_std": 0.08234351128339767, "epsilon_dpo/loss_margin_mean": 0.2534914016723633, "grad_norm": 18.693727493286133, "kl/avg_steps": 0.3125, "kl/beta": 0.09109188616275787, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.03249434754252434, "logits/rejected": -0.15132024884223938, "logps/chosen": -85.06571960449219, "logps/ref_chosen": -84.69587707519531, "logps/ref_rejected": -92.96185302734375, "logps/rejected": -93.58517456054688, "loss": 1.3657, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03381003811955452, "rewards/margins": 0.02242732048034668, "rewards/rejected": -0.0562373585999012, "step": 86 }, { "epoch": 0.13151927437641722, "epsilon_dpo/beta": 0.09053338319063187, "epsilon_dpo/beta_margin_grad_mean": -0.48980429768562317, "epsilon_dpo/beta_margin_grad_std": 0.02582697570323944, "epsilon_dpo/beta_margin_mean": 0.040944747626781464, "epsilon_dpo/beta_margin_std": 0.10375433415174484, "epsilon_dpo/loss_margin_mean": 0.4599999487400055, "grad_norm": 19.913551330566406, "kl/avg_steps": 0.3125, "kl/beta": 0.09080810844898224, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.987388156241114e-07, "logits/chosen": 0.16291941702365875, "logits/rejected": 0.10878261923789978, "logps/chosen": -91.9239730834961, "logps/ref_chosen": -91.67546844482422, "logps/ref_rejected": -113.0994644165039, "logps/rejected": -113.80797576904297, "loss": 1.3485, "rewards/accuracies": 0.703125, "rewards/chosen": -0.022793810814619064, "rewards/margins": 0.040944769978523254, "rewards/rejected": -0.06373857706785202, "step": 87 }, { "epoch": 0.1330309901738473, "epsilon_dpo/beta": 0.09036451578140259, "epsilon_dpo/beta_margin_grad_mean": -0.49780169129371643, "epsilon_dpo/beta_margin_grad_std": 0.02381385862827301, "epsilon_dpo/beta_margin_mean": 0.008791986852884293, "epsilon_dpo/beta_margin_std": 0.09545378386974335, "epsilon_dpo/loss_margin_mean": 0.1055518090724945, "grad_norm": 19.109310150146484, "kl/avg_steps": 0.1875, "kl/beta": 0.09052521735429764, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.986026928455767e-07, "logits/chosen": 0.1496576964855194, "logits/rejected": 0.30393093824386597, "logps/chosen": -88.25755310058594, "logps/ref_chosen": -87.91022491455078, "logps/ref_rejected": -97.09971618652344, "logps/rejected": -97.55259704589844, "loss": 1.3798, "rewards/accuracies": 0.546875, "rewards/chosen": -0.03172972798347473, "rewards/margins": 0.008791987784206867, "rewards/rejected": -0.04052171856164932, "step": 88 }, { "epoch": 0.1345427059712774, "epsilon_dpo/beta": 0.09002596884965897, "epsilon_dpo/beta_margin_grad_mean": -0.4860003590583801, "epsilon_dpo/beta_margin_grad_std": 0.02549973875284195, "epsilon_dpo/beta_margin_mean": 0.056202370673418045, "epsilon_dpo/beta_margin_std": 0.10239987820386887, "epsilon_dpo/loss_margin_mean": 0.6319105625152588, "grad_norm": 19.2386417388916, "kl/avg_steps": 0.375, "kl/beta": 0.09035580605268478, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.984596161153135e-07, "logits/chosen": 0.22525973618030548, "logits/rejected": 0.19805394113063812, "logps/chosen": -61.868133544921875, "logps/ref_chosen": -61.86646270751953, "logps/ref_rejected": -109.35748291015625, "logps/rejected": -109.99107360839844, "loss": 1.3335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0005648209480568767, "rewards/margins": 0.05620241165161133, "rewards/rejected": -0.05676723271608353, "step": 89 }, { "epoch": 0.1360544217687075, "epsilon_dpo/beta": 0.08960523456335068, "epsilon_dpo/beta_margin_grad_mean": -0.48835986852645874, "epsilon_dpo/beta_margin_grad_std": 0.02396991290152073, "epsilon_dpo/beta_margin_mean": 0.0466950498521328, "epsilon_dpo/beta_margin_std": 0.09624534845352173, "epsilon_dpo/loss_margin_mean": 0.5280405282974243, "grad_norm": 20.006235122680664, "kl/avg_steps": 0.46875, "kl/beta": 0.09001823514699936, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.983095894354857e-07, "logits/chosen": 0.19097547233104706, "logits/rejected": 0.24766327440738678, "logps/chosen": -81.07152557373047, "logps/ref_chosen": -80.89642333984375, "logps/ref_rejected": -115.47240447998047, "logps/rejected": -116.17554473876953, "loss": 1.3425, "rewards/accuracies": 0.734375, "rewards/chosen": -0.01596353016793728, "rewards/margins": 0.046695053577423096, "rewards/rejected": -0.06265857815742493, "step": 90 }, { "epoch": 0.13756613756613756, "epsilon_dpo/beta": 0.08943919092416763, "epsilon_dpo/beta_margin_grad_mean": -0.4875122010707855, "epsilon_dpo/beta_margin_grad_std": 0.027514338493347168, "epsilon_dpo/beta_margin_mean": 0.05014381930232048, "epsilon_dpo/beta_margin_std": 0.11042484641075134, "epsilon_dpo/loss_margin_mean": 0.5706114172935486, "grad_norm": 17.863554000854492, "kl/avg_steps": 0.1875, "kl/beta": 0.08959824591875076, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.98152617002662e-07, "logits/chosen": 0.08106125146150589, "logits/rejected": 0.11380688101053238, "logps/chosen": -74.26775360107422, "logps/ref_chosen": -74.18321228027344, "logps/ref_rejected": -98.54835510253906, "logps/rejected": -99.20350646972656, "loss": 1.3398, "rewards/accuracies": 0.59375, "rewards/chosen": -0.008093868382275105, "rewards/margins": 0.0501437783241272, "rewards/rejected": -0.058237649500370026, "step": 91 }, { "epoch": 0.13907785336356765, "epsilon_dpo/beta": 0.08935565501451492, "epsilon_dpo/beta_margin_grad_mean": -0.49239903688430786, "epsilon_dpo/beta_margin_grad_std": 0.03181348368525505, "epsilon_dpo/beta_margin_mean": 0.03056526929140091, "epsilon_dpo/beta_margin_std": 0.1277761161327362, "epsilon_dpo/loss_margin_mean": 0.35297879576683044, "grad_norm": 19.929975509643555, "kl/avg_steps": 0.09375, "kl/beta": 0.0894305631518364, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.979887032076988e-07, "logits/chosen": 0.1341371238231659, "logits/rejected": 0.2965472638607025, "logps/chosen": -77.81866455078125, "logps/ref_chosen": -77.45124816894531, "logps/ref_rejected": -98.57789611816406, "logps/rejected": -99.29828643798828, "loss": 1.36, "rewards/accuracies": 0.515625, "rewards/chosen": -0.033338889479637146, "rewards/margins": 0.030565232038497925, "rewards/rejected": -0.06390412151813507, "step": 92 }, { "epoch": 0.14058956916099774, "epsilon_dpo/beta": 0.08924403786659241, "epsilon_dpo/beta_margin_grad_mean": -0.49429938197135925, "epsilon_dpo/beta_margin_grad_std": 0.03447291627526283, "epsilon_dpo/beta_margin_mean": 0.022963959723711014, "epsilon_dpo/beta_margin_std": 0.13867661356925964, "epsilon_dpo/loss_margin_mean": 0.2693634033203125, "grad_norm": 16.53227424621582, "kl/avg_steps": 0.125, "kl/beta": 0.08934679627418518, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.978178526356172e-07, "logits/chosen": -0.046473048627376556, "logits/rejected": 0.13449837267398834, "logps/chosen": -67.758544921875, "logps/ref_chosen": -67.58747100830078, "logps/ref_rejected": -81.46165466308594, "logps/rejected": -81.90208435058594, "loss": 1.3683, "rewards/accuracies": 0.546875, "rewards/chosen": -0.015754221007227898, "rewards/margins": 0.022963931784033775, "rewards/rejected": -0.03871815279126167, "step": 93 }, { "epoch": 0.1421012849584278, "epsilon_dpo/beta": 0.0889652892947197, "epsilon_dpo/beta_margin_grad_mean": -0.4860864281654358, "epsilon_dpo/beta_margin_grad_std": 0.038560304790735245, "epsilon_dpo/beta_margin_mean": 0.05622333660721779, "epsilon_dpo/beta_margin_std": 0.1561013162136078, "epsilon_dpo/loss_margin_mean": 0.6430590152740479, "grad_norm": 19.217985153198242, "kl/avg_steps": 0.3125, "kl/beta": 0.08923525363206863, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.976400700654751e-07, "logits/chosen": 0.021004803478717804, "logits/rejected": -0.012107852846384048, "logps/chosen": -85.24430847167969, "logps/ref_chosen": -85.1270523071289, "logps/ref_rejected": -102.55231475830078, "logps/rejected": -103.31263732910156, "loss": 1.3369, "rewards/accuracies": 0.671875, "rewards/chosen": -0.011046299710869789, "rewards/margins": 0.05622336268424988, "rewards/rejected": -0.06726966798305511, "step": 94 }, { "epoch": 0.1436130007558579, "epsilon_dpo/beta": 0.08874374628067017, "epsilon_dpo/beta_margin_grad_mean": -0.4858720302581787, "epsilon_dpo/beta_margin_grad_std": 0.036910202354192734, "epsilon_dpo/beta_margin_mean": 0.05691284313797951, "epsilon_dpo/beta_margin_std": 0.1488497406244278, "epsilon_dpo/loss_margin_mean": 0.6532175540924072, "grad_norm": 20.199888229370117, "kl/avg_steps": 0.25, "kl/beta": 0.08895726501941681, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.974553604702332e-07, "logits/chosen": 0.19126339256763458, "logits/rejected": 0.3154688775539398, "logps/chosen": -85.20614624023438, "logps/ref_chosen": -84.89950561523438, "logps/ref_rejected": -116.98944091796875, "logps/rejected": -117.94930267333984, "loss": 1.3357, "rewards/accuracies": 0.625, "rewards/chosen": -0.027601946145296097, "rewards/margins": 0.056912824511528015, "rewards/rejected": -0.08451476693153381, "step": 95 }, { "epoch": 0.14512471655328799, "epsilon_dpo/beta": 0.08855017274618149, "epsilon_dpo/beta_margin_grad_mean": -0.48356518149375916, "epsilon_dpo/beta_margin_grad_std": 0.038781337440013885, "epsilon_dpo/beta_margin_mean": 0.06643398851156235, "epsilon_dpo/beta_margin_std": 0.15669092535972595, "epsilon_dpo/loss_margin_mean": 0.7625659704208374, "grad_norm": 18.814918518066406, "kl/avg_steps": 0.21875, "kl/beta": 0.0887354239821434, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.972637290166157e-07, "logits/chosen": 0.20108938217163086, "logits/rejected": 0.3051398694515228, "logps/chosen": -83.65084838867188, "logps/ref_chosen": -83.37467956542969, "logps/ref_rejected": -109.94418334960938, "logps/rejected": -110.98291015625, "loss": 1.3271, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02498556300997734, "rewards/margins": 0.06643401086330414, "rewards/rejected": -0.09141957014799118, "step": 96 }, { "epoch": 0.14663643235071808, "epsilon_dpo/beta": 0.08843990415334702, "epsilon_dpo/beta_margin_grad_mean": -0.49306973814964294, "epsilon_dpo/beta_margin_grad_std": 0.036061882972717285, "epsilon_dpo/beta_margin_mean": 0.027809543535113335, "epsilon_dpo/beta_margin_std": 0.14509648084640503, "epsilon_dpo/loss_margin_mean": 0.3274966776371002, "grad_norm": 19.39443016052246, "kl/avg_steps": 0.125, "kl/beta": 0.08854173868894577, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.970651810649666e-07, "logits/chosen": 0.16154778003692627, "logits/rejected": 0.19344012439250946, "logps/chosen": -90.42269897460938, "logps/ref_chosen": -89.9176025390625, "logps/ref_rejected": -106.89872741699219, "logps/rejected": -107.73133087158203, "loss": 1.3639, "rewards/accuracies": 0.578125, "rewards/chosen": -0.04536530748009682, "rewards/margins": 0.02780964970588684, "rewards/rejected": -0.07317495346069336, "step": 97 }, { "epoch": 0.14814814814814814, "epsilon_dpo/beta": 0.08821894228458405, "epsilon_dpo/beta_margin_grad_mean": -0.4927210807800293, "epsilon_dpo/beta_margin_grad_std": 0.03449036926031113, "epsilon_dpo/beta_margin_mean": 0.029177924618124962, "epsilon_dpo/beta_margin_std": 0.13922542333602905, "epsilon_dpo/loss_margin_mean": 0.3414802551269531, "grad_norm": 18.046215057373047, "kl/avg_steps": 0.25, "kl/beta": 0.08843120187520981, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.968597221690985e-07, "logits/chosen": 0.11720709502696991, "logits/rejected": 0.3440602421760559, "logps/chosen": -79.58689880371094, "logps/ref_chosen": -79.12416076660156, "logps/ref_rejected": -96.21038818359375, "logps/rejected": -97.01461029052734, "loss": 1.3622, "rewards/accuracies": 0.625, "rewards/chosen": -0.04131529480218887, "rewards/margins": 0.02917795069515705, "rewards/rejected": -0.07049324363470078, "step": 98 }, { "epoch": 0.14965986394557823, "epsilon_dpo/beta": 0.08808165043592453, "epsilon_dpo/beta_margin_grad_mean": -0.48774832487106323, "epsilon_dpo/beta_margin_grad_std": 0.043416302651166916, "epsilon_dpo/beta_margin_mean": 0.049811091274023056, "epsilon_dpo/beta_margin_std": 0.1761889010667801, "epsilon_dpo/loss_margin_mean": 0.5790194272994995, "grad_norm": 19.454317092895508, "kl/avg_steps": 0.15625, "kl/beta": 0.08821067214012146, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.966473580761389e-07, "logits/chosen": 0.06653241068124771, "logits/rejected": 0.320538729429245, "logps/chosen": -82.21040344238281, "logps/ref_chosen": -81.95954895019531, "logps/ref_rejected": -110.06712341308594, "logps/rejected": -110.8969955444336, "loss": 1.3448, "rewards/accuracies": 0.53125, "rewards/chosen": -0.022646091878414154, "rewards/margins": 0.04981113597750664, "rewards/rejected": -0.07245723158121109, "step": 99 }, { "epoch": 0.15117157974300832, "epsilon_dpo/beta": 0.0878891870379448, "epsilon_dpo/beta_margin_grad_mean": -0.4885050058364868, "epsilon_dpo/beta_margin_grad_std": 0.04928956553339958, "epsilon_dpo/beta_margin_mean": 0.0469319149851799, "epsilon_dpo/beta_margin_std": 0.2008965015411377, "epsilon_dpo/loss_margin_mean": 0.5497127771377563, "grad_norm": 19.978914260864258, "kl/avg_steps": 0.21875, "kl/beta": 0.08807305991649628, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.964280947263676e-07, "logits/chosen": 0.1993522197008133, "logits/rejected": 0.27488380670547485, "logps/chosen": -106.40570068359375, "logps/ref_chosen": -105.83000183105469, "logps/ref_rejected": -113.98239135742188, "logps/rejected": -115.10780334472656, "loss": 1.3499, "rewards/accuracies": 0.609375, "rewards/chosen": -0.051381174474954605, "rewards/margins": 0.046931929886341095, "rewards/rejected": -0.098313108086586, "step": 100 }, { "epoch": 0.15117157974300832, "eval_epsilon_dpo/beta": 0.0876714214682579, "eval_epsilon_dpo/beta_margin_grad_mean": -0.48583516478538513, "eval_epsilon_dpo/beta_margin_grad_std": 0.04487954452633858, "eval_epsilon_dpo/beta_margin_mean": 0.05737590044736862, "eval_epsilon_dpo/beta_margin_std": 0.18227516114711761, "eval_epsilon_dpo/loss_margin_mean": 0.6693454384803772, "eval_kl/n_epsilon_steps": 0.37588027119636536, "eval_kl/p_epsilon_steps": 0.6241196990013123, "eval_logits/chosen": 0.11220303922891617, "eval_logits/rejected": 0.12404737621545792, "eval_logps/chosen": -87.733154296875, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -105.2108383178711, "eval_loss": 0.6692028045654297, "eval_rewards/accuracies": 0.6311619877815247, "eval_rewards/chosen": -0.027519822120666504, "eval_rewards/margins": 0.057375892996788025, "eval_rewards/rejected": -0.08489571511745453, "eval_runtime": 47.7081, "eval_samples_per_second": 48.273, "eval_steps_per_second": 1.509, "step": 100 }, { "epoch": 0.15268329554043839, "epsilon_dpo/beta": 0.08745016157627106, "epsilon_dpo/beta_margin_grad_mean": -0.48087450861930847, "epsilon_dpo/beta_margin_grad_std": 0.035437703132629395, "epsilon_dpo/beta_margin_mean": 0.07677444070577621, "epsilon_dpo/beta_margin_std": 0.142720028758049, "epsilon_dpo/loss_margin_mean": 0.8880053758621216, "grad_norm": 16.972736358642578, "kl/avg_steps": 0.5, "kl/beta": 0.08788082003593445, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.2826855480670929, "logits/rejected": -0.02019951120018959, "logps/chosen": -74.92662048339844, "logps/ref_chosen": -74.89061737060547, "logps/ref_rejected": -96.2606430053711, "logps/rejected": -97.18464660644531, "loss": 1.3161, "rewards/accuracies": 0.734375, "rewards/chosen": -0.003706066869199276, "rewards/margins": 0.07677440345287323, "rewards/rejected": -0.08048047125339508, "step": 101 }, { "epoch": 0.15419501133786848, "epsilon_dpo/beta": 0.08742501586675644, "epsilon_dpo/beta_margin_grad_mean": -0.4893290102481842, "epsilon_dpo/beta_margin_grad_std": 0.0464043989777565, "epsilon_dpo/beta_margin_mean": 0.043459098786115646, "epsilon_dpo/beta_margin_std": 0.18812763690948486, "epsilon_dpo/loss_margin_mean": 0.5132368206977844, "grad_norm": 17.96248435974121, "kl/avg_steps": 0.03125, "kl/beta": 0.08744360506534576, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.959688949822748e-07, "logits/chosen": 0.18720057606697083, "logits/rejected": 0.10392941534519196, "logps/chosen": -87.634521484375, "logps/ref_chosen": -87.23258972167969, "logps/ref_rejected": -97.16923522949219, "logps/rejected": -98.08440399169922, "loss": 1.3521, "rewards/accuracies": 0.53125, "rewards/chosen": -0.035716209560632706, "rewards/margins": 0.04345907270908356, "rewards/rejected": -0.07917527854442596, "step": 102 }, { "epoch": 0.15570672713529857, "epsilon_dpo/beta": 0.08715182542800903, "epsilon_dpo/beta_margin_grad_mean": -0.47798603773117065, "epsilon_dpo/beta_margin_grad_std": 0.04238240793347359, "epsilon_dpo/beta_margin_mean": 0.08902125805616379, "epsilon_dpo/beta_margin_std": 0.17188170552253723, "epsilon_dpo/loss_margin_mean": 1.0343337059020996, "grad_norm": 20.719120025634766, "kl/avg_steps": 0.3125, "kl/beta": 0.08741628378629684, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.957289714327572e-07, "logits/chosen": -0.14926955103874207, "logits/rejected": 0.17941465973854065, "logps/chosen": -85.25311279296875, "logps/ref_chosen": -85.21533203125, "logps/ref_rejected": -105.19367980957031, "logps/rejected": -106.26579284667969, "loss": 1.3066, "rewards/accuracies": 0.6875, "rewards/chosen": -0.004020987544208765, "rewards/margins": 0.08902129530906677, "rewards/rejected": -0.0930422842502594, "step": 103 }, { "epoch": 0.15721844293272866, "epsilon_dpo/beta": 0.08682584762573242, "epsilon_dpo/beta_margin_grad_mean": -0.47746115922927856, "epsilon_dpo/beta_margin_grad_std": 0.05114719271659851, "epsilon_dpo/beta_margin_mean": 0.0918625146150589, "epsilon_dpo/beta_margin_std": 0.2094704508781433, "epsilon_dpo/loss_margin_mean": 1.073301911354065, "grad_norm": 19.88721466064453, "kl/avg_steps": 0.375, "kl/beta": 0.08714395761489868, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.954821743156767e-07, "logits/chosen": 0.26530805230140686, "logits/rejected": 0.4458683133125305, "logps/chosen": -79.91732788085938, "logps/ref_chosen": -79.83291625976562, "logps/ref_rejected": -128.39964294433594, "logps/rejected": -129.557373046875, "loss": 1.3074, "rewards/accuracies": 0.671875, "rewards/chosen": -0.007967567071318626, "rewards/margins": 0.09186242520809174, "rewards/rejected": -0.09982998669147491, "step": 104 }, { "epoch": 0.15873015873015872, "epsilon_dpo/beta": 0.08647433668375015, "epsilon_dpo/beta_margin_grad_mean": -0.47956883907318115, "epsilon_dpo/beta_margin_grad_std": 0.05354239419102669, "epsilon_dpo/beta_margin_mean": 0.0827774778008461, "epsilon_dpo/beta_margin_std": 0.21730993688106537, "epsilon_dpo/loss_margin_mean": 0.9738532304763794, "grad_norm": 19.439359664916992, "kl/avg_steps": 0.40625, "kl/beta": 0.0868183895945549, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.952285105344791e-07, "logits/chosen": -0.006076253950595856, "logits/rejected": 0.18681776523590088, "logps/chosen": -95.36561584472656, "logps/ref_chosen": -94.9561767578125, "logps/ref_rejected": -119.2974853515625, "logps/rejected": -120.68077087402344, "loss": 1.3169, "rewards/accuracies": 0.6875, "rewards/chosen": -0.036373164504766464, "rewards/margins": 0.08277745544910431, "rewards/rejected": -0.11915061622858047, "step": 105 }, { "epoch": 0.1602418745275888, "epsilon_dpo/beta": 0.08609743416309357, "epsilon_dpo/beta_margin_grad_mean": -0.4816688895225525, "epsilon_dpo/beta_margin_grad_std": 0.04480881616473198, "epsilon_dpo/beta_margin_mean": 0.07405252009630203, "epsilon_dpo/beta_margin_std": 0.18235161900520325, "epsilon_dpo/loss_margin_mean": 0.8727110624313354, "grad_norm": 17.128450393676758, "kl/avg_steps": 0.4375, "kl/beta": 0.08646711707115173, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.949679871846857e-07, "logits/chosen": 0.1752602756023407, "logits/rejected": 0.12919707596302032, "logps/chosen": -81.92420959472656, "logps/ref_chosen": -82.00679016113281, "logps/ref_rejected": -89.77203369140625, "logps/rejected": -90.56216430664062, "loss": 1.3218, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0064305951818823814, "rewards/margins": 0.07405255734920502, "rewards/rejected": -0.06762196123600006, "step": 106 }, { "epoch": 0.1617535903250189, "epsilon_dpo/beta": 0.08591073751449585, "epsilon_dpo/beta_margin_grad_mean": -0.4878389835357666, "epsilon_dpo/beta_margin_grad_std": 0.04847485199570656, "epsilon_dpo/beta_margin_mean": 0.04896136373281479, "epsilon_dpo/beta_margin_std": 0.19566507637500763, "epsilon_dpo/loss_margin_mean": 0.5875788331031799, "grad_norm": 19.390165328979492, "kl/avg_steps": 0.21875, "kl/beta": 0.08609047532081604, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.947006115536947e-07, "logits/chosen": -0.012461047619581223, "logits/rejected": 0.23972593247890472, "logps/chosen": -102.92835998535156, "logps/ref_chosen": -102.37884521484375, "logps/ref_rejected": -113.08601379394531, "logps/rejected": -114.22311401367188, "loss": 1.3475, "rewards/accuracies": 0.609375, "rewards/chosen": -0.04798401519656181, "rewards/margins": 0.04896135628223419, "rewards/rejected": -0.0969453752040863, "step": 107 }, { "epoch": 0.16326530612244897, "epsilon_dpo/beta": 0.08564267307519913, "epsilon_dpo/beta_margin_grad_mean": -0.487002968788147, "epsilon_dpo/beta_margin_grad_std": 0.051084764301776886, "epsilon_dpo/beta_margin_mean": 0.05215499550104141, "epsilon_dpo/beta_margin_std": 0.20733976364135742, "epsilon_dpo/loss_margin_mean": 0.6264809966087341, "grad_norm": 17.811731338500977, "kl/avg_steps": 0.3125, "kl/beta": 0.08590255677700043, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.944263911205772e-07, "logits/chosen": 0.044683102518320084, "logits/rejected": -0.024815678596496582, "logps/chosen": -91.81816101074219, "logps/ref_chosen": -91.51483154296875, "logps/ref_rejected": -103.61510467529297, "logps/rejected": -104.544921875, "loss": 1.3455, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02699120342731476, "rewards/margins": 0.052154965698719025, "rewards/rejected": -0.07914617657661438, "step": 108 }, { "epoch": 0.16477702191987906, "epsilon_dpo/beta": 0.08545617759227753, "epsilon_dpo/beta_margin_grad_mean": -0.48020491003990173, "epsilon_dpo/beta_margin_grad_std": 0.05594097077846527, "epsilon_dpo/beta_margin_mean": 0.08056788146495819, "epsilon_dpo/beta_margin_std": 0.22767306864261627, "epsilon_dpo/loss_margin_mean": 0.9629918336868286, "grad_norm": 17.902441024780273, "kl/avg_steps": 0.21875, "kl/beta": 0.08563495427370071, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.941453335558681e-07, "logits/chosen": 0.17891252040863037, "logits/rejected": -0.03912736475467682, "logps/chosen": -83.23799896240234, "logps/ref_chosen": -83.30767822265625, "logps/ref_rejected": -107.25475311279297, "logps/rejected": -108.14806365966797, "loss": 1.3202, "rewards/accuracies": 0.640625, "rewards/chosen": 0.005163197405636311, "rewards/margins": 0.08056794106960297, "rewards/rejected": -0.07540474086999893, "step": 109 }, { "epoch": 0.16628873771730915, "epsilon_dpo/beta": 0.08529634773731232, "epsilon_dpo/beta_margin_grad_mean": -0.496372789144516, "epsilon_dpo/beta_margin_grad_std": 0.0572805292904377, "epsilon_dpo/beta_margin_mean": 0.014190551824867725, "epsilon_dpo/beta_margin_std": 0.23347991704940796, "epsilon_dpo/loss_margin_mean": 0.18647027015686035, "grad_norm": 20.98345375061035, "kl/avg_steps": 0.1875, "kl/beta": 0.08544803410768509, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.938574467213517e-07, "logits/chosen": -0.02646956965327263, "logits/rejected": 0.17269685864448547, "logps/chosen": -104.24122619628906, "logps/ref_chosen": -103.62700653076172, "logps/ref_rejected": -98.57982635498047, "logps/rejected": -99.3805160522461, "loss": 1.3857, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05344737321138382, "rewards/margins": 0.014190555550158024, "rewards/rejected": -0.06763792783021927, "step": 110 }, { "epoch": 0.16780045351473924, "epsilon_dpo/beta": 0.08516336977481842, "epsilon_dpo/beta_margin_grad_mean": -0.4857950806617737, "epsilon_dpo/beta_margin_grad_std": 0.05001501739025116, "epsilon_dpo/beta_margin_mean": 0.05734188109636307, "epsilon_dpo/beta_margin_std": 0.20205387473106384, "epsilon_dpo/loss_margin_mean": 0.691691517829895, "grad_norm": 16.861454010009766, "kl/avg_steps": 0.15625, "kl/beta": 0.08528811484575272, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.935627386698418e-07, "logits/chosen": -0.08613793551921844, "logits/rejected": -0.06585603207349777, "logps/chosen": -75.0671157836914, "logps/ref_chosen": -74.81674194335938, "logps/ref_rejected": -97.25582122802734, "logps/rejected": -98.19789123535156, "loss": 1.3399, "rewards/accuracies": 0.625, "rewards/chosen": -0.022064432501792908, "rewards/margins": 0.05734182894229889, "rewards/rejected": -0.0794062614440918, "step": 111 }, { "epoch": 0.1693121693121693, "epsilon_dpo/beta": 0.08471114188432693, "epsilon_dpo/beta_margin_grad_mean": -0.46657806634902954, "epsilon_dpo/beta_margin_grad_std": 0.05246131494641304, "epsilon_dpo/beta_margin_mean": 0.13550089299678802, "epsilon_dpo/beta_margin_std": 0.21291041374206543, "epsilon_dpo/loss_margin_mean": 1.614532470703125, "grad_norm": 19.040555953979492, "kl/avg_steps": 0.53125, "kl/beta": 0.08515506237745285, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.932612176449559e-07, "logits/chosen": 0.0674043744802475, "logits/rejected": 0.21437129378318787, "logps/chosen": -82.7444839477539, "logps/ref_chosen": -83.02973175048828, "logps/ref_rejected": -130.31686401367188, "logps/rejected": -131.64613342285156, "loss": 1.2666, "rewards/accuracies": 0.78125, "rewards/chosen": 0.02339255064725876, "rewards/margins": 0.13550087809562683, "rewards/rejected": -0.11210831254720688, "step": 112 }, { "epoch": 0.1708238851095994, "epsilon_dpo/beta": 0.08447527885437012, "epsilon_dpo/beta_margin_grad_mean": -0.48426786065101624, "epsilon_dpo/beta_margin_grad_std": 0.05853166803717613, "epsilon_dpo/beta_margin_mean": 0.06381077319383621, "epsilon_dpo/beta_margin_std": 0.2405387908220291, "epsilon_dpo/loss_margin_mean": 0.7743654251098633, "grad_norm": 17.83749771118164, "kl/avg_steps": 0.28125, "kl/beta": 0.08470506966114044, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.929528920808854e-07, "logits/chosen": -0.16013295948505402, "logits/rejected": 0.22781725227832794, "logps/chosen": -77.02522277832031, "logps/ref_chosen": -77.30706787109375, "logps/ref_rejected": -95.55424499511719, "logps/rejected": -96.04676818847656, "loss": 1.3378, "rewards/accuracies": 0.671875, "rewards/chosen": 0.022567156702280045, "rewards/margins": 0.06381077319383621, "rewards/rejected": -0.04124361276626587, "step": 113 }, { "epoch": 0.17233560090702948, "epsilon_dpo/beta": 0.08426475524902344, "epsilon_dpo/beta_margin_grad_mean": -0.4811977446079254, "epsilon_dpo/beta_margin_grad_std": 0.055794645100831985, "epsilon_dpo/beta_margin_mean": 0.07640980184078217, "epsilon_dpo/beta_margin_std": 0.226527139544487, "epsilon_dpo/loss_margin_mean": 0.9263713955879211, "grad_norm": 18.02449607849121, "kl/avg_steps": 0.25, "kl/beta": 0.08446750044822693, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.92637770602159e-07, "logits/chosen": 0.0650443509221077, "logits/rejected": -0.15849056839942932, "logps/chosen": -91.77374267578125, "logps/ref_chosen": -91.81670379638672, "logps/ref_rejected": -104.94361877441406, "logps/rejected": -105.82703399658203, "loss": 1.3241, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029620258137583733, "rewards/margins": 0.07640981674194336, "rewards/rejected": -0.07344779372215271, "step": 114 }, { "epoch": 0.17384731670445955, "epsilon_dpo/beta": 0.08402828872203827, "epsilon_dpo/beta_margin_grad_mean": -0.4740752875804901, "epsilon_dpo/beta_margin_grad_std": 0.060490477830171585, "epsilon_dpo/beta_margin_mean": 0.1062379777431488, "epsilon_dpo/beta_margin_std": 0.24839532375335693, "epsilon_dpo/loss_margin_mean": 1.2842512130737305, "grad_norm": 17.447057723999023, "kl/avg_steps": 0.28125, "kl/beta": 0.0842568576335907, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.923158620234019e-07, "logits/chosen": -0.1340171992778778, "logits/rejected": -0.15569829940795898, "logps/chosen": -74.81437683105469, "logps/ref_chosen": -75.18775939941406, "logps/ref_rejected": -103.025634765625, "logps/rejected": -103.93651580810547, "loss": 1.2981, "rewards/accuracies": 0.671875, "rewards/chosen": 0.03075750358402729, "rewards/margins": 0.10623794794082642, "rewards/rejected": -0.07548044621944427, "step": 115 }, { "epoch": 0.17535903250188964, "epsilon_dpo/beta": 0.08368758857250214, "epsilon_dpo/beta_margin_grad_mean": -0.4681667387485504, "epsilon_dpo/beta_margin_grad_std": 0.0566866435110569, "epsilon_dpo/beta_margin_mean": 0.1293656975030899, "epsilon_dpo/beta_margin_std": 0.23101994395256042, "epsilon_dpo/loss_margin_mean": 1.5630714893341064, "grad_norm": 17.443565368652344, "kl/avg_steps": 0.40625, "kl/beta": 0.08402055501937866, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.91987175349089e-07, "logits/chosen": -0.1186988577246666, "logits/rejected": 0.14887529611587524, "logps/chosen": -68.73472595214844, "logps/ref_chosen": -69.45286560058594, "logps/ref_rejected": -96.70513153076172, "logps/rejected": -97.55005645751953, "loss": 1.2743, "rewards/accuracies": 0.734375, "rewards/chosen": 0.05944942682981491, "rewards/margins": 0.1293656826019287, "rewards/rejected": -0.0699162632226944, "step": 116 }, { "epoch": 0.17687074829931973, "epsilon_dpo/beta": 0.08342743664979935, "epsilon_dpo/beta_margin_grad_mean": -0.47609150409698486, "epsilon_dpo/beta_margin_grad_std": 0.061780449002981186, "epsilon_dpo/beta_margin_mean": 0.09696857631206512, "epsilon_dpo/beta_margin_std": 0.2511261999607086, "epsilon_dpo/loss_margin_mean": 1.1843879222869873, "grad_norm": 16.713768005371094, "kl/avg_steps": 0.3125, "kl/beta": 0.08368059992790222, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.916517197732933e-07, "logits/chosen": -0.10571661591529846, "logits/rejected": -0.055547840893268585, "logps/chosen": -77.54984283447266, "logps/ref_chosen": -78.27897644042969, "logps/ref_rejected": -92.01252746582031, "logps/rejected": -92.46778869628906, "loss": 1.3073, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05986713990569115, "rewards/margins": 0.09696857631206512, "rewards/rejected": -0.03710143640637398, "step": 117 }, { "epoch": 0.17838246409674982, "epsilon_dpo/beta": 0.08308932185173035, "epsilon_dpo/beta_margin_grad_mean": -0.47561800479888916, "epsilon_dpo/beta_margin_grad_std": 0.0590248741209507, "epsilon_dpo/beta_margin_mean": 0.09878911077976227, "epsilon_dpo/beta_margin_std": 0.23957890272140503, "epsilon_dpo/loss_margin_mean": 1.2095437049865723, "grad_norm": 16.629575729370117, "kl/avg_steps": 0.40625, "kl/beta": 0.08341991156339645, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.913095046794281e-07, "logits/chosen": 0.1436939239501953, "logits/rejected": 0.16659829020500183, "logps/chosen": -75.42961120605469, "logps/ref_chosen": -76.03759765625, "logps/ref_rejected": -103.52017211914062, "logps/rejected": -104.12173461914062, "loss": 1.3042, "rewards/accuracies": 0.703125, "rewards/chosen": 0.04961453005671501, "rewards/margins": 0.09878911077976227, "rewards/rejected": -0.049174580723047256, "step": 118 }, { "epoch": 0.17989417989417988, "epsilon_dpo/beta": 0.0829349085688591, "epsilon_dpo/beta_margin_grad_mean": -0.4789765775203705, "epsilon_dpo/beta_margin_grad_std": 0.07665891945362091, "epsilon_dpo/beta_margin_mean": 0.08581650257110596, "epsilon_dpo/beta_margin_std": 0.3149167597293854, "epsilon_dpo/loss_margin_mean": 1.063791036605835, "grad_norm": 17.0283260345459, "kl/avg_steps": 0.1875, "kl/beta": 0.08308239281177521, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.909605396399855e-07, "logits/chosen": -0.09115805476903915, "logits/rejected": -0.07789916545152664, "logps/chosen": -82.70497131347656, "logps/ref_chosen": -83.196044921875, "logps/ref_rejected": -103.79010009765625, "logps/rejected": -104.3628158569336, "loss": 1.3268, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03945211321115494, "rewards/margins": 0.08581651002168655, "rewards/rejected": -0.046364400535821915, "step": 119 }, { "epoch": 0.18140589569160998, "epsilon_dpo/beta": 0.08257235586643219, "epsilon_dpo/beta_margin_grad_mean": -0.46213966608047485, "epsilon_dpo/beta_margin_grad_std": 0.055534347891807556, "epsilon_dpo/beta_margin_mean": 0.15391035377979279, "epsilon_dpo/beta_margin_std": 0.22700658440589905, "epsilon_dpo/loss_margin_mean": 1.8817929029464722, "grad_norm": 18.204940795898438, "kl/avg_steps": 0.4375, "kl/beta": 0.082926906645298, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.906048344162676e-07, "logits/chosen": 0.13377802073955536, "logits/rejected": 0.15807950496673584, "logps/chosen": -83.54419708251953, "logps/ref_chosen": -84.4485855102539, "logps/ref_rejected": -110.97505950927734, "logps/rejected": -111.95246887207031, "loss": 1.251, "rewards/accuracies": 0.765625, "rewards/chosen": 0.07405398041009903, "rewards/margins": 0.15391036868095398, "rewards/rejected": -0.07985638827085495, "step": 120 }, { "epoch": 0.18291761148904007, "epsilon_dpo/beta": 0.08239330351352692, "epsilon_dpo/beta_margin_grad_mean": -0.4695954918861389, "epsilon_dpo/beta_margin_grad_std": 0.07052520662546158, "epsilon_dpo/beta_margin_mean": 0.12572382390499115, "epsilon_dpo/beta_margin_std": 0.2914615273475647, "epsilon_dpo/loss_margin_mean": 1.5515471696853638, "grad_norm": 17.93547821044922, "kl/avg_steps": 0.21875, "kl/beta": 0.08256568014621735, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.902423989581143e-07, "logits/chosen": -0.16934193670749664, "logits/rejected": -0.0674653947353363, "logps/chosen": -76.95396423339844, "logps/ref_chosen": -77.71607971191406, "logps/ref_rejected": -127.38249969482422, "logps/rejected": -128.17193603515625, "loss": 1.2853, "rewards/accuracies": 0.625, "rewards/chosen": 0.06202112138271332, "rewards/margins": 0.12572380900382996, "rewards/rejected": -0.06370268762111664, "step": 121 }, { "epoch": 0.18442932728647016, "epsilon_dpo/beta": 0.08221346139907837, "epsilon_dpo/beta_margin_grad_mean": -0.4754737317562103, "epsilon_dpo/beta_margin_grad_std": 0.07240553200244904, "epsilon_dpo/beta_margin_mean": 0.10210616141557693, "epsilon_dpo/beta_margin_std": 0.30038896203041077, "epsilon_dpo/loss_margin_mean": 1.2675155401229858, "grad_norm": 17.199674606323242, "kl/avg_steps": 0.21875, "kl/beta": 0.08238545805215836, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.898732434036243e-07, "logits/chosen": -0.11136245727539062, "logits/rejected": -0.16447636485099792, "logps/chosen": -83.83956909179688, "logps/ref_chosen": -84.65371704101562, "logps/ref_rejected": -98.93002319335938, "logps/rejected": -99.38338470458984, "loss": 1.3089, "rewards/accuracies": 0.625, "rewards/chosen": 0.0658092126250267, "rewards/margins": 0.10210618376731873, "rewards/rejected": -0.03629697486758232, "step": 122 }, { "epoch": 0.18594104308390022, "epsilon_dpo/beta": 0.08200832456350327, "epsilon_dpo/beta_margin_grad_mean": -0.4690355062484741, "epsilon_dpo/beta_margin_grad_std": 0.06405887752771378, "epsilon_dpo/beta_margin_mean": 0.12652172148227692, "epsilon_dpo/beta_margin_std": 0.2623896598815918, "epsilon_dpo/loss_margin_mean": 1.5664746761322021, "grad_norm": 16.217876434326172, "kl/avg_steps": 0.25, "kl/beta": 0.0822056382894516, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.894973780788722e-07, "logits/chosen": 0.05034737288951874, "logits/rejected": 0.10809341073036194, "logps/chosen": -76.31719970703125, "logps/ref_chosen": -77.63322448730469, "logps/ref_rejected": -101.60614013671875, "logps/rejected": -101.85658264160156, "loss": 1.2807, "rewards/accuracies": 0.640625, "rewards/chosen": 0.10695463418960571, "rewards/margins": 0.1265217363834381, "rewards/rejected": -0.019567107781767845, "step": 123 }, { "epoch": 0.1874527588813303, "epsilon_dpo/beta": 0.08172693103551865, "epsilon_dpo/beta_margin_grad_mean": -0.4645516574382782, "epsilon_dpo/beta_margin_grad_std": 0.08100985735654831, "epsilon_dpo/beta_margin_mean": 0.14648139476776123, "epsilon_dpo/beta_margin_std": 0.3352731168270111, "epsilon_dpo/loss_margin_mean": 1.8201913833618164, "grad_norm": 17.60958480834961, "kl/avg_steps": 0.34375, "kl/beta": 0.08200063556432724, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.89114813497619e-07, "logits/chosen": -0.2651221752166748, "logits/rejected": 0.24253401160240173, "logps/chosen": -75.82962036132812, "logps/ref_chosen": -77.36146545410156, "logps/ref_rejected": -116.55441284179688, "logps/rejected": -116.84275817871094, "loss": 1.2727, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12411925941705704, "rewards/margins": 0.14648135006427765, "rewards/rejected": -0.02236209250986576, "step": 124 }, { "epoch": 0.1889644746787604, "epsilon_dpo/beta": 0.08131925761699677, "epsilon_dpo/beta_margin_grad_mean": -0.4667989909648895, "epsilon_dpo/beta_margin_grad_std": 0.07144204527139664, "epsilon_dpo/beta_margin_mean": 0.13624414801597595, "epsilon_dpo/beta_margin_std": 0.2952669858932495, "epsilon_dpo/loss_margin_mean": 1.698812484741211, "grad_norm": 18.070085525512695, "kl/avg_steps": 0.5, "kl/beta": 0.08171971887350082, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.887255603610184e-07, "logits/chosen": 0.05429578199982643, "logits/rejected": -0.024670738726854324, "logps/chosen": -87.46548461914062, "logps/ref_chosen": -88.697998046875, "logps/ref_rejected": -121.4852523803711, "logps/rejected": -121.95155334472656, "loss": 1.2761, "rewards/accuracies": 0.734375, "rewards/chosen": 0.09958155453205109, "rewards/margins": 0.13624417781829834, "rewards/rejected": -0.03666263073682785, "step": 125 }, { "epoch": 0.19047619047619047, "epsilon_dpo/beta": 0.08111798763275146, "epsilon_dpo/beta_margin_grad_mean": -0.4818357229232788, "epsilon_dpo/beta_margin_grad_std": 0.07389285415410995, "epsilon_dpo/beta_margin_mean": 0.07455535233020782, "epsilon_dpo/beta_margin_std": 0.3029778301715851, "epsilon_dpo/loss_margin_mean": 0.946387767791748, "grad_norm": 17.885833740234375, "kl/avg_steps": 0.25, "kl/beta": 0.08131315559148788, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.883296295573176e-07, "logits/chosen": 0.0120854452252388, "logits/rejected": 0.16786350309848785, "logps/chosen": -90.82383728027344, "logps/ref_chosen": -92.44024658203125, "logps/ref_rejected": -91.21477508544922, "logps/rejected": -90.54476928710938, "loss": 1.3358, "rewards/accuracies": 0.625, "rewards/chosen": 0.12970831990242004, "rewards/margins": 0.07455536723136902, "rewards/rejected": 0.05515296012163162, "step": 126 }, { "epoch": 0.19198790627362056, "epsilon_dpo/beta": 0.08101709187030792, "epsilon_dpo/beta_margin_grad_mean": -0.47536662220954895, "epsilon_dpo/beta_margin_grad_std": 0.06849021464586258, "epsilon_dpo/beta_margin_mean": 0.10088615864515305, "epsilon_dpo/beta_margin_std": 0.27939197421073914, "epsilon_dpo/loss_margin_mean": 1.2726936340332031, "grad_norm": 16.920372009277344, "kl/avg_steps": 0.125, "kl/beta": 0.081110380589962, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.87927032161552e-07, "logits/chosen": -0.09022588282823563, "logits/rejected": 0.08949241042137146, "logps/chosen": -85.76454162597656, "logps/ref_chosen": -87.55062103271484, "logps/ref_rejected": -97.16986083984375, "logps/rejected": -96.65647888183594, "loss": 1.3072, "rewards/accuracies": 0.546875, "rewards/chosen": 0.1433461308479309, "rewards/margins": 0.10088618844747543, "rewards/rejected": 0.04245994985103607, "step": 127 }, { "epoch": 0.19349962207105065, "epsilon_dpo/beta": 0.080764040350914, "epsilon_dpo/beta_margin_grad_mean": -0.47240495681762695, "epsilon_dpo/beta_margin_grad_std": 0.09400974959135056, "epsilon_dpo/beta_margin_mean": 0.11469464004039764, "epsilon_dpo/beta_margin_std": 0.39688840508461, "epsilon_dpo/loss_margin_mean": 1.452173113822937, "grad_norm": 18.381317138671875, "kl/avg_steps": 0.3125, "kl/beta": 0.08100911974906921, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.875177794352363e-07, "logits/chosen": 0.08796774595975876, "logits/rejected": -0.0978081226348877, "logps/chosen": -91.56980895996094, "logps/ref_chosen": -93.4315185546875, "logps/ref_rejected": -118.90899658203125, "logps/rejected": -118.49946594238281, "loss": 1.3131, "rewards/accuracies": 0.640625, "rewards/chosen": 0.14945122599601746, "rewards/margins": 0.11469465494155884, "rewards/rejected": 0.03475657105445862, "step": 128 }, { "epoch": 0.19501133786848074, "epsilon_dpo/beta": 0.08068910986185074, "epsilon_dpo/beta_margin_grad_mean": -0.48256921768188477, "epsilon_dpo/beta_margin_grad_std": 0.09119141101837158, "epsilon_dpo/beta_margin_mean": 0.07243873178958893, "epsilon_dpo/beta_margin_std": 0.3800669014453888, "epsilon_dpo/loss_margin_mean": 0.9343692064285278, "grad_norm": 18.35399627685547, "kl/avg_steps": 0.09375, "kl/beta": 0.08075675368309021, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.871018828260491e-07, "logits/chosen": -0.10205899924039841, "logits/rejected": -6.125867366790771e-05, "logps/chosen": -90.28472900390625, "logps/ref_chosen": -91.62559509277344, "logps/ref_rejected": -93.87883758544922, "logps/rejected": -93.47234344482422, "loss": 1.3505, "rewards/accuracies": 0.5625, "rewards/chosen": 0.10695064067840576, "rewards/margins": 0.07243867963552475, "rewards/rejected": 0.03451196104288101, "step": 129 }, { "epoch": 0.1965230536659108, "epsilon_dpo/beta": 0.08038660138845444, "epsilon_dpo/beta_margin_grad_mean": -0.4643150269985199, "epsilon_dpo/beta_margin_grad_std": 0.0763268992304802, "epsilon_dpo/beta_margin_mean": 0.1470072716474533, "epsilon_dpo/beta_margin_std": 0.31455495953559875, "epsilon_dpo/loss_margin_mean": 1.8561002016067505, "grad_norm": 17.832624435424805, "kl/avg_steps": 0.375, "kl/beta": 0.08068111538887024, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.866793539675126e-07, "logits/chosen": -0.015426471829414368, "logits/rejected": 0.08768454939126968, "logps/chosen": -82.5267333984375, "logps/ref_chosen": -84.67145538330078, "logps/ref_rejected": -104.28001403808594, "logps/rejected": -103.99138641357422, "loss": 1.269, "rewards/accuracies": 0.671875, "rewards/chosen": 0.17135189473628998, "rewards/margins": 0.1470073014497757, "rewards/rejected": 0.024344589561223984, "step": 130 }, { "epoch": 0.1980347694633409, "epsilon_dpo/beta": 0.08016163855791092, "epsilon_dpo/beta_margin_grad_mean": -0.45822128653526306, "epsilon_dpo/beta_margin_grad_std": 0.08405930548906326, "epsilon_dpo/beta_margin_mean": 0.17493918538093567, "epsilon_dpo/beta_margin_std": 0.35443899035453796, "epsilon_dpo/loss_margin_mean": 2.212343692779541, "grad_norm": 16.11795997619629, "kl/avg_steps": 0.28125, "kl/beta": 0.08037969470024109, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.86250204678667e-07, "logits/chosen": -0.13019153475761414, "logits/rejected": -0.10465170443058014, "logps/chosen": -71.01728057861328, "logps/ref_chosen": -73.30256652832031, "logps/ref_rejected": -106.04727172851562, "logps/rejected": -105.97431945800781, "loss": 1.2494, "rewards/accuracies": 0.65625, "rewards/chosen": 0.18196183443069458, "rewards/margins": 0.17493918538093567, "rewards/rejected": 0.00702265277504921, "step": 131 }, { "epoch": 0.19954648526077098, "epsilon_dpo/beta": 0.07983660697937012, "epsilon_dpo/beta_margin_grad_mean": -0.47368597984313965, "epsilon_dpo/beta_margin_grad_std": 0.07017382979393005, "epsilon_dpo/beta_margin_mean": 0.10802603513002396, "epsilon_dpo/beta_margin_std": 0.28927475214004517, "epsilon_dpo/loss_margin_mean": 1.3770979642868042, "grad_norm": 16.9212646484375, "kl/avg_steps": 0.40625, "kl/beta": 0.08015425503253937, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.858144469637408e-07, "logits/chosen": -0.0966077670454979, "logits/rejected": -0.13967418670654297, "logps/chosen": -85.59697723388672, "logps/ref_chosen": -87.72061920166016, "logps/ref_rejected": -96.35941314697266, "logps/rejected": -95.61286926269531, "loss": 1.3017, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16805227100849152, "rewards/margins": 0.10802602767944336, "rewards/rejected": 0.06002624332904816, "step": 132 }, { "epoch": 0.20105820105820105, "epsilon_dpo/beta": 0.07968823611736298, "epsilon_dpo/beta_margin_grad_mean": -0.4730938971042633, "epsilon_dpo/beta_margin_grad_std": 0.07494788616895676, "epsilon_dpo/beta_margin_mean": 0.1104988306760788, "epsilon_dpo/beta_margin_std": 0.30697232484817505, "epsilon_dpo/loss_margin_mean": 1.416254997253418, "grad_norm": 17.388769149780273, "kl/avg_steps": 0.1875, "kl/beta": 0.07982994616031647, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.853720930118138e-07, "logits/chosen": -0.20302733778953552, "logits/rejected": 0.018017714843153954, "logps/chosen": -87.78105163574219, "logps/ref_chosen": -89.97994232177734, "logps/ref_rejected": -94.67462158203125, "logps/rejected": -93.89199829101562, "loss": 1.3021, "rewards/accuracies": 0.578125, "rewards/chosen": 0.17398512363433838, "rewards/margins": 0.11049885302782059, "rewards/rejected": 0.0634862631559372, "step": 133 }, { "epoch": 0.20256991685563114, "epsilon_dpo/beta": 0.07929006963968277, "epsilon_dpo/beta_margin_grad_mean": -0.45269298553466797, "epsilon_dpo/beta_margin_grad_std": 0.09166352450847626, "epsilon_dpo/beta_margin_mean": 0.19899149239063263, "epsilon_dpo/beta_margin_std": 0.3938853442668915, "epsilon_dpo/loss_margin_mean": 2.538273334503174, "grad_norm": 15.28248119354248, "kl/avg_steps": 0.5, "kl/beta": 0.07968054711818695, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.2937329411506653, "logits/rejected": -0.21715673804283142, "logps/chosen": -73.44625854492188, "logps/ref_chosen": -76.23042297363281, "logps/ref_rejected": -100.25575256347656, "logps/rejected": -100.0098648071289, "loss": 1.2343, "rewards/accuracies": 0.734375, "rewards/chosen": 0.21985578536987305, "rewards/margins": 0.19899150729179382, "rewards/rejected": 0.020864300429821014, "step": 134 }, { "epoch": 0.20408163265306123, "epsilon_dpo/beta": 0.07901948690414429, "epsilon_dpo/beta_margin_grad_mean": -0.4722573459148407, "epsilon_dpo/beta_margin_grad_std": 0.07582727819681168, "epsilon_dpo/beta_margin_mean": 0.11306837946176529, "epsilon_dpo/beta_margin_std": 0.3117622435092926, "epsilon_dpo/loss_margin_mean": 1.458648443222046, "grad_norm": 14.768946647644043, "kl/avg_steps": 0.34375, "kl/beta": 0.07928412407636642, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.844676460754862e-07, "logits/chosen": -0.32549357414245605, "logits/rejected": -0.032874904572963715, "logps/chosen": -67.47518920898438, "logps/ref_chosen": -70.47183990478516, "logps/ref_rejected": -87.71185302734375, "logps/rejected": -86.17385864257812, "loss": 1.3004, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23563119769096375, "rewards/margins": 0.11306841671466827, "rewards/rejected": 0.12256277352571487, "step": 135 }, { "epoch": 0.20559334845049132, "epsilon_dpo/beta": 0.07887225598096848, "epsilon_dpo/beta_margin_grad_mean": -0.45937618613243103, "epsilon_dpo/beta_margin_grad_std": 0.1084052100777626, "epsilon_dpo/beta_margin_mean": 0.17362651228904724, "epsilon_dpo/beta_margin_std": 0.46317678689956665, "epsilon_dpo/loss_margin_mean": 2.243760824203491, "grad_norm": 17.279558181762695, "kl/avg_steps": 0.1875, "kl/beta": 0.07901252061128616, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.840055783904106e-07, "logits/chosen": 0.16665995121002197, "logits/rejected": 0.0742223784327507, "logps/chosen": -88.99594116210938, "logps/ref_chosen": -91.05680847167969, "logps/ref_rejected": -125.45597839355469, "logps/rejected": -125.63887023925781, "loss": 1.2718, "rewards/accuracies": 0.609375, "rewards/chosen": 0.1612674742937088, "rewards/margins": 0.17362651228904724, "rewards/rejected": -0.012359026819467545, "step": 136 }, { "epoch": 0.20710506424792138, "epsilon_dpo/beta": 0.07857676595449448, "epsilon_dpo/beta_margin_grad_mean": -0.46005377173423767, "epsilon_dpo/beta_margin_grad_std": 0.087054044008255, "epsilon_dpo/beta_margin_mean": 0.16871152818202972, "epsilon_dpo/beta_margin_std": 0.3723953664302826, "epsilon_dpo/loss_margin_mean": 2.1775121688842773, "grad_norm": 14.91991138458252, "kl/avg_steps": 0.375, "kl/beta": 0.07886464893817902, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.835369650662767e-07, "logits/chosen": -0.2173086702823639, "logits/rejected": -0.10981342196464539, "logps/chosen": -73.40845489501953, "logps/ref_chosen": -76.20159912109375, "logps/ref_rejected": -95.24153137207031, "logps/rejected": -94.62591552734375, "loss": 1.258, "rewards/accuracies": 0.671875, "rewards/chosen": 0.2189544439315796, "rewards/margins": 0.1687115877866745, "rewards/rejected": 0.05024286359548569, "step": 137 }, { "epoch": 0.20861678004535147, "epsilon_dpo/beta": 0.07833231985569, "epsilon_dpo/beta_margin_grad_mean": -0.47148823738098145, "epsilon_dpo/beta_margin_grad_std": 0.09270107001066208, "epsilon_dpo/beta_margin_mean": 0.11959164589643478, "epsilon_dpo/beta_margin_std": 0.39499062299728394, "epsilon_dpo/loss_margin_mean": 1.5597786903381348, "grad_norm": 15.54496955871582, "kl/avg_steps": 0.3125, "kl/beta": 0.07857001572847366, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.830618192112065e-07, "logits/chosen": -0.33799946308135986, "logits/rejected": -0.5601421594619751, "logps/chosen": -76.29164123535156, "logps/ref_chosen": -78.20573425292969, "logps/ref_rejected": -90.40987396240234, "logps/rejected": -90.05555725097656, "loss": 1.3079, "rewards/accuracies": 0.65625, "rewards/chosen": 0.14817282557487488, "rewards/margins": 0.11959163099527359, "rewards/rejected": 0.028581196442246437, "step": 138 }, { "epoch": 0.21012849584278157, "epsilon_dpo/beta": 0.07813724875450134, "epsilon_dpo/beta_margin_grad_mean": -0.4711554944515228, "epsilon_dpo/beta_margin_grad_std": 0.09030873328447342, "epsilon_dpo/beta_margin_mean": 0.11854705959558487, "epsilon_dpo/beta_margin_std": 0.3753059506416321, "epsilon_dpo/loss_margin_mean": 1.5526247024536133, "grad_norm": 18.069461822509766, "kl/avg_steps": 0.25, "kl/beta": 0.07832524925470352, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.825801541160509e-07, "logits/chosen": -0.15845316648483276, "logits/rejected": -0.138905331492424, "logps/chosen": -93.53260803222656, "logps/ref_chosen": -94.67202758789062, "logps/ref_rejected": -108.34249114990234, "logps/rejected": -108.75569152832031, "loss": 1.3057, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0876041054725647, "rewards/margins": 0.11854708194732666, "rewards/rejected": -0.030942972749471664, "step": 139 }, { "epoch": 0.21164021164021163, "epsilon_dpo/beta": 0.07794239372015, "epsilon_dpo/beta_margin_grad_mean": -0.4526260495185852, "epsilon_dpo/beta_margin_grad_std": 0.10473916679620743, "epsilon_dpo/beta_margin_mean": 0.19932420551776886, "epsilon_dpo/beta_margin_std": 0.4362448453903198, "epsilon_dpo/loss_margin_mean": 2.601555824279785, "grad_norm": 17.814538955688477, "kl/avg_steps": 0.25, "kl/beta": 0.07812992483377457, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.23502662777900696, "logits/rejected": -0.037265803664922714, "logps/chosen": -85.66082763671875, "logps/ref_chosen": -87.68214416503906, "logps/ref_rejected": -109.80048370361328, "logps/rejected": -110.38072967529297, "loss": 1.2433, "rewards/accuracies": 0.625, "rewards/chosen": 0.15537020564079285, "rewards/margins": 0.19932423532009125, "rewards/rejected": -0.0439540296792984, "step": 140 }, { "epoch": 0.21315192743764172, "epsilon_dpo/beta": 0.07765059173107147, "epsilon_dpo/beta_margin_grad_mean": -0.45165613293647766, "epsilon_dpo/beta_margin_grad_std": 0.09902672469615936, "epsilon_dpo/beta_margin_mean": 0.20049114525318146, "epsilon_dpo/beta_margin_std": 0.4123653471469879, "epsilon_dpo/loss_margin_mean": 2.6210761070251465, "grad_norm": 15.114709854125977, "kl/avg_steps": 0.375, "kl/beta": 0.07793508470058441, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.815973202802966e-07, "logits/chosen": -0.14430901408195496, "logits/rejected": 0.059034563601017, "logps/chosen": -81.68354797363281, "logps/ref_chosen": -83.4490966796875, "logps/ref_rejected": -108.92472839355469, "logps/rejected": -109.78024291992188, "loss": 1.2373, "rewards/accuracies": 0.703125, "rewards/chosen": 0.13589784502983093, "rewards/margins": 0.20049111545085907, "rewards/rejected": -0.06459328532218933, "step": 141 }, { "epoch": 0.2146636432350718, "epsilon_dpo/beta": 0.07750608026981354, "epsilon_dpo/beta_margin_grad_mean": -0.46795445680618286, "epsilon_dpo/beta_margin_grad_std": 0.09034660458564758, "epsilon_dpo/beta_margin_mean": 0.13596083223819733, "epsilon_dpo/beta_margin_std": 0.3836277425289154, "epsilon_dpo/loss_margin_mean": 1.7882254123687744, "grad_norm": 16.118061065673828, "kl/avg_steps": 0.1875, "kl/beta": 0.07764391601085663, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.810961790316729e-07, "logits/chosen": -0.17365016043186188, "logits/rejected": -0.21685616672039032, "logps/chosen": -88.09880065917969, "logps/ref_chosen": -89.39207458496094, "logps/ref_rejected": -102.29324340820312, "logps/rejected": -102.78819274902344, "loss": 1.2905, "rewards/accuracies": 0.640625, "rewards/chosen": 0.09935611486434937, "rewards/margins": 0.13596080243587494, "rewards/rejected": -0.03660469129681587, "step": 142 }, { "epoch": 0.2161753590325019, "epsilon_dpo/beta": 0.07738525420427322, "epsilon_dpo/beta_margin_grad_mean": -0.47416046261787415, "epsilon_dpo/beta_margin_grad_std": 0.10722901672124863, "epsilon_dpo/beta_margin_mean": 0.10877778381109238, "epsilon_dpo/beta_margin_std": 0.4481872320175171, "epsilon_dpo/loss_margin_mean": 1.4516710042953491, "grad_norm": 18.209449768066406, "kl/avg_steps": 0.15625, "kl/beta": 0.07749860733747482, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.805885735261454e-07, "logits/chosen": -0.11513273417949677, "logits/rejected": -0.07733216881752014, "logps/chosen": -85.3487777709961, "logps/ref_chosen": -86.82447814941406, "logps/ref_rejected": -95.78927612304688, "logps/rejected": -95.76524353027344, "loss": 1.3295, "rewards/accuracies": 0.59375, "rewards/chosen": 0.11212005466222763, "rewards/margins": 0.10877779126167297, "rewards/rejected": 0.0033422596752643585, "step": 143 }, { "epoch": 0.21768707482993196, "epsilon_dpo/beta": 0.07736126333475113, "epsilon_dpo/beta_margin_grad_mean": -0.47966277599334717, "epsilon_dpo/beta_margin_grad_std": 0.11707232892513275, "epsilon_dpo/beta_margin_mean": 0.09194158017635345, "epsilon_dpo/beta_margin_std": 0.4984915256500244, "epsilon_dpo/loss_margin_mean": 1.2385411262512207, "grad_norm": 18.285436630249023, "kl/avg_steps": 0.03125, "kl/beta": 0.07737770676612854, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.800745179625307e-07, "logits/chosen": -0.055155664682388306, "logits/rejected": 0.14967641234397888, "logps/chosen": -83.60164642333984, "logps/ref_chosen": -84.393310546875, "logps/ref_rejected": -98.43213653564453, "logps/rejected": -98.87901306152344, "loss": 1.3565, "rewards/accuracies": 0.484375, "rewards/chosen": 0.05851306766271591, "rewards/margins": 0.09194157272577286, "rewards/rejected": -0.03342851251363754, "step": 144 }, { "epoch": 0.21919879062736206, "epsilon_dpo/beta": 0.07711951434612274, "epsilon_dpo/beta_margin_grad_mean": -0.46541881561279297, "epsilon_dpo/beta_margin_grad_std": 0.1151481494307518, "epsilon_dpo/beta_margin_mean": 0.142835795879364, "epsilon_dpo/beta_margin_std": 0.48386383056640625, "epsilon_dpo/loss_margin_mean": 1.9007998704910278, "grad_norm": 19.481618881225586, "kl/avg_steps": 0.3125, "kl/beta": 0.07735353708267212, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.795540267200686e-07, "logits/chosen": -0.24535146355628967, "logits/rejected": -0.1606999635696411, "logps/chosen": -103.21641540527344, "logps/ref_chosen": -104.59640502929688, "logps/ref_rejected": -104.37325286865234, "logps/rejected": -104.8940658569336, "loss": 1.3056, "rewards/accuracies": 0.640625, "rewards/chosen": 0.10378595441579819, "rewards/margins": 0.14283576607704163, "rewards/rejected": -0.03904980793595314, "step": 145 }, { "epoch": 0.22071050642479215, "epsilon_dpo/beta": 0.07699976861476898, "epsilon_dpo/beta_margin_grad_mean": -0.4759455621242523, "epsilon_dpo/beta_margin_grad_std": 0.09818226099014282, "epsilon_dpo/beta_margin_mean": 0.09853528439998627, "epsilon_dpo/beta_margin_std": 0.4120221436023712, "epsilon_dpo/loss_margin_mean": 1.321279764175415, "grad_norm": 16.349870681762695, "kl/avg_steps": 0.15625, "kl/beta": 0.07711255550384521, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.790271143580173e-07, "logits/chosen": -0.3498794436454773, "logits/rejected": -0.0407610684633255, "logps/chosen": -80.74744415283203, "logps/ref_chosen": -82.47695922851562, "logps/ref_rejected": -88.8701171875, "logps/rejected": -88.46188354492188, "loss": 1.3316, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13155552744865417, "rewards/margins": 0.09853523969650269, "rewards/rejected": 0.033020272850990295, "step": 146 }, { "epoch": 0.2222222222222222, "epsilon_dpo/beta": 0.07683151960372925, "epsilon_dpo/beta_margin_grad_mean": -0.4728216528892517, "epsilon_dpo/beta_margin_grad_std": 0.09379173070192337, "epsilon_dpo/beta_margin_mean": 0.11323540657758713, "epsilon_dpo/beta_margin_std": 0.3893289864063263, "epsilon_dpo/loss_margin_mean": 1.5119967460632324, "grad_norm": 17.558822631835938, "kl/avg_steps": 0.21875, "kl/beta": 0.07699225842952728, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.784937956152489e-07, "logits/chosen": -0.10575242340564728, "logits/rejected": -0.1430339366197586, "logps/chosen": -89.42935943603516, "logps/ref_chosen": -90.40057373046875, "logps/ref_rejected": -104.02629089355469, "logps/rejected": -104.56707763671875, "loss": 1.3134, "rewards/accuracies": 0.609375, "rewards/chosen": 0.07326846569776535, "rewards/margins": 0.11323542892932892, "rewards/rejected": -0.03996695578098297, "step": 147 }, { "epoch": 0.2237339380196523, "epsilon_dpo/beta": 0.07651975750923157, "epsilon_dpo/beta_margin_grad_mean": -0.44731733202934265, "epsilon_dpo/beta_margin_grad_std": 0.11125834286212921, "epsilon_dpo/beta_margin_mean": 0.22437097132205963, "epsilon_dpo/beta_margin_std": 0.4774388372898102, "epsilon_dpo/loss_margin_mean": 2.9725053310394287, "grad_norm": 14.532676696777344, "kl/avg_steps": 0.40625, "kl/beta": 0.07682420313358307, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.779540854098347e-07, "logits/chosen": -0.1696356236934662, "logits/rejected": -0.37039607763290405, "logps/chosen": -69.44158935546875, "logps/ref_chosen": -71.44349670410156, "logps/ref_rejected": -97.9690170288086, "logps/rejected": -98.93960571289062, "loss": 1.229, "rewards/accuracies": 0.703125, "rewards/chosen": 0.15155889093875885, "rewards/margins": 0.22437095642089844, "rewards/rejected": -0.07281208038330078, "step": 148 }, { "epoch": 0.2252456538170824, "epsilon_dpo/beta": 0.07632970809936523, "epsilon_dpo/beta_margin_grad_mean": -0.46323949098587036, "epsilon_dpo/beta_margin_grad_std": 0.09741820394992828, "epsilon_dpo/beta_margin_mean": 0.15397903323173523, "epsilon_dpo/beta_margin_std": 0.40842485427856445, "epsilon_dpo/loss_margin_mean": 2.057908773422241, "grad_norm": 16.44625473022461, "kl/avg_steps": 0.25, "kl/beta": 0.0765133649110794, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.774079988386296e-07, "logits/chosen": -0.2737234830856323, "logits/rejected": -0.3755127191543579, "logps/chosen": -77.71635437011719, "logps/ref_chosen": -78.34549713134766, "logps/ref_rejected": -90.18255615234375, "logps/rejected": -91.611328125, "loss": 1.2788, "rewards/accuracies": 0.640625, "rewards/chosen": 0.04624457284808159, "rewards/margins": 0.15397900342941284, "rewards/rejected": -0.10773443430662155, "step": 149 }, { "epoch": 0.22675736961451248, "epsilon_dpo/beta": 0.07602010667324066, "epsilon_dpo/beta_margin_grad_mean": -0.43523797392845154, "epsilon_dpo/beta_margin_grad_std": 0.12384334951639175, "epsilon_dpo/beta_margin_mean": 0.27531003952026367, "epsilon_dpo/beta_margin_std": 0.5316606163978577, "epsilon_dpo/loss_margin_mean": 3.6705944538116455, "grad_norm": 15.97244930267334, "kl/avg_steps": 0.40625, "kl/beta": 0.07632256299257278, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.768555511768486e-07, "logits/chosen": -0.1420062780380249, "logits/rejected": 0.244110107421875, "logps/chosen": -91.69683837890625, "logps/ref_chosen": -92.33145904541016, "logps/ref_rejected": -113.8134765625, "logps/rejected": -116.84945678710938, "loss": 1.1975, "rewards/accuracies": 0.71875, "rewards/chosen": 0.045774202793836594, "rewards/margins": 0.27531006932258606, "rewards/rejected": -0.22953587770462036, "step": 150 }, { "epoch": 0.22826908541194255, "epsilon_dpo/beta": 0.07576003670692444, "epsilon_dpo/beta_margin_grad_mean": -0.44170546531677246, "epsilon_dpo/beta_margin_grad_std": 0.11201171576976776, "epsilon_dpo/beta_margin_mean": 0.25496262311935425, "epsilon_dpo/beta_margin_std": 0.4993725121021271, "epsilon_dpo/loss_margin_mean": 3.4074244499206543, "grad_norm": 15.150566101074219, "kl/avg_steps": 0.34375, "kl/beta": 0.07601375877857208, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.762967578776406e-07, "logits/chosen": -0.12288770824670792, "logits/rejected": -0.15996623039245605, "logps/chosen": -72.63821411132812, "logps/ref_chosen": -74.43798828125, "logps/ref_rejected": -96.98008728027344, "logps/rejected": -98.58773803710938, "loss": 1.2056, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13440963625907898, "rewards/margins": 0.25496259331703186, "rewards/rejected": -0.12055293470621109, "step": 151 }, { "epoch": 0.22978080120937264, "epsilon_dpo/beta": 0.07542947679758072, "epsilon_dpo/beta_margin_grad_mean": -0.4548986852169037, "epsilon_dpo/beta_margin_grad_std": 0.11138568818569183, "epsilon_dpo/beta_margin_mean": 0.1916198432445526, "epsilon_dpo/beta_margin_std": 0.4807240962982178, "epsilon_dpo/loss_margin_mean": 2.5821034908294678, "grad_norm": 17.452680587768555, "kl/avg_steps": 0.4375, "kl/beta": 0.075753353536129, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.757316345716553e-07, "logits/chosen": -0.05779954418540001, "logits/rejected": -0.016348371282219887, "logps/chosen": -77.57362365722656, "logps/ref_chosen": -77.98807525634766, "logps/ref_rejected": -103.52305603027344, "logps/rejected": -105.69070434570312, "loss": 1.259, "rewards/accuracies": 0.703125, "rewards/chosen": 0.02951541170477867, "rewards/margins": 0.1916198581457138, "rewards/rejected": -0.16210445761680603, "step": 152 }, { "epoch": 0.23129251700680273, "epsilon_dpo/beta": 0.07521876692771912, "epsilon_dpo/beta_margin_grad_mean": -0.4610231816768646, "epsilon_dpo/beta_margin_grad_std": 0.09761146456003189, "epsilon_dpo/beta_margin_mean": 0.16371819376945496, "epsilon_dpo/beta_margin_std": 0.4085807204246521, "epsilon_dpo/loss_margin_mean": 2.216705560684204, "grad_norm": 16.362926483154297, "kl/avg_steps": 0.28125, "kl/beta": 0.07542337477207184, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.751601970666064e-07, "logits/chosen": -0.28554776310920715, "logits/rejected": -0.3057416081428528, "logps/chosen": -96.85858154296875, "logps/ref_chosen": -95.72166442871094, "logps/ref_rejected": -100.04006958007812, "logps/rejected": -103.39369201660156, "loss": 1.2699, "rewards/accuracies": 0.640625, "rewards/chosen": -0.08693025261163712, "rewards/margins": 0.16371814906597137, "rewards/rejected": -0.2506484091281891, "step": 153 }, { "epoch": 0.2328042328042328, "epsilon_dpo/beta": 0.07503131777048111, "epsilon_dpo/beta_margin_grad_mean": -0.4663916826248169, "epsilon_dpo/beta_margin_grad_std": 0.11229141056537628, "epsilon_dpo/beta_margin_mean": 0.1404602825641632, "epsilon_dpo/beta_margin_std": 0.47098055481910706, "epsilon_dpo/loss_margin_mean": 1.9198356866836548, "grad_norm": 16.172679901123047, "kl/avg_steps": 0.25, "kl/beta": 0.07521184533834457, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.745824613468292e-07, "logits/chosen": -0.3652782440185547, "logits/rejected": -0.49543774127960205, "logps/chosen": -82.0560302734375, "logps/ref_chosen": -82.50230407714844, "logps/ref_rejected": -82.01812744140625, "logps/rejected": -83.49168395996094, "loss": 1.3048, "rewards/accuracies": 0.625, "rewards/chosen": 0.03172617405653, "rewards/margins": 0.14046026766300201, "rewards/rejected": -0.10873409360647202, "step": 154 }, { "epoch": 0.23431594860166288, "epsilon_dpo/beta": 0.0747738629579544, "epsilon_dpo/beta_margin_grad_mean": -0.4513029158115387, "epsilon_dpo/beta_margin_grad_std": 0.11095915734767914, "epsilon_dpo/beta_margin_mean": 0.20482796430587769, "epsilon_dpo/beta_margin_std": 0.47065576910972595, "epsilon_dpo/loss_margin_mean": 2.7832815647125244, "grad_norm": 16.804861068725586, "kl/avg_steps": 0.34375, "kl/beta": 0.07502428442239761, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.7399844357283393e-07, "logits/chosen": -0.12051165103912354, "logits/rejected": -0.13051287829875946, "logps/chosen": -90.16606903076172, "logps/ref_chosen": -89.09580993652344, "logps/ref_rejected": -101.94664001464844, "logps/rejected": -105.80018615722656, "loss": 1.2454, "rewards/accuracies": 0.671875, "rewards/chosen": -0.08176211267709732, "rewards/margins": 0.20482799410820007, "rewards/rejected": -0.2865900993347168, "step": 155 }, { "epoch": 0.23582766439909297, "epsilon_dpo/beta": 0.07442424446344376, "epsilon_dpo/beta_margin_grad_mean": -0.42783400416374207, "epsilon_dpo/beta_margin_grad_std": 0.10387593507766724, "epsilon_dpo/beta_margin_mean": 0.3065878450870514, "epsilon_dpo/beta_margin_std": 0.44690340757369995, "epsilon_dpo/loss_margin_mean": 4.155799865722656, "grad_norm": 17.409835815429688, "kl/avg_steps": 0.46875, "kl/beta": 0.07476726919412613, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.7340816008085305e-07, "logits/chosen": -0.25020796060562134, "logits/rejected": 0.00035034865140914917, "logps/chosen": -95.34709930419922, "logps/ref_chosen": -94.36488342285156, "logps/ref_rejected": -119.41386413574219, "logps/rejected": -124.5518798828125, "loss": 1.1506, "rewards/accuracies": 0.75, "rewards/chosen": -0.0749220922589302, "rewards/margins": 0.306587815284729, "rewards/rejected": -0.3815099000930786, "step": 156 }, { "epoch": 0.23733938019652306, "epsilon_dpo/beta": 0.07421655207872391, "epsilon_dpo/beta_margin_grad_mean": -0.4635535776615143, "epsilon_dpo/beta_margin_grad_std": 0.11218099296092987, "epsilon_dpo/beta_margin_mean": 0.15487989783287048, "epsilon_dpo/beta_margin_std": 0.4757290482521057, "epsilon_dpo/loss_margin_mean": 2.131009817123413, "grad_norm": 14.102409362792969, "kl/avg_steps": 0.28125, "kl/beta": 0.07441843301057816, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.728116273823847e-07, "logits/chosen": -0.44304779171943665, "logits/rejected": -0.15374836325645447, "logps/chosen": -75.98442077636719, "logps/ref_chosen": -75.79052734375, "logps/ref_rejected": -84.51288604736328, "logps/rejected": -86.83779907226562, "loss": 1.2921, "rewards/accuracies": 0.65625, "rewards/chosen": -0.016395092010498047, "rewards/margins": 0.15487989783287048, "rewards/rejected": -0.17127500474452972, "step": 157 }, { "epoch": 0.23885109599395313, "epsilon_dpo/beta": 0.07407798618078232, "epsilon_dpo/beta_margin_grad_mean": -0.46485766768455505, "epsilon_dpo/beta_margin_grad_std": 0.1275767832994461, "epsilon_dpo/beta_margin_mean": 0.15245291590690613, "epsilon_dpo/beta_margin_std": 0.5510936379432678, "epsilon_dpo/loss_margin_mean": 2.1135518550872803, "grad_norm": 15.723213195800781, "kl/avg_steps": 0.1875, "kl/beta": 0.07420971989631653, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -0.3810843527317047, "logits/rejected": -0.2918296754360199, "logps/chosen": -87.435302734375, "logps/ref_chosen": -87.19711303710938, "logps/ref_rejected": -104.29627227783203, "logps/rejected": -106.64801025390625, "loss": 1.3123, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0198702123016119, "rewards/margins": 0.1524529755115509, "rewards/rejected": -0.17232318222522736, "step": 158 }, { "epoch": 0.24036281179138322, "epsilon_dpo/beta": 0.07382359355688095, "epsilon_dpo/beta_margin_grad_mean": -0.4507734477519989, "epsilon_dpo/beta_margin_grad_std": 0.1107042208313942, "epsilon_dpo/beta_margin_mean": 0.20948928594589233, "epsilon_dpo/beta_margin_std": 0.4829138219356537, "epsilon_dpo/loss_margin_mean": 2.880485773086548, "grad_norm": 15.230273246765137, "kl/avg_steps": 0.34375, "kl/beta": 0.07407083362340927, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.715998812855304e-07, "logits/chosen": -0.2649644911289215, "logits/rejected": -0.09603996574878693, "logps/chosen": -78.42363739013672, "logps/ref_chosen": -78.20480346679688, "logps/ref_rejected": -95.99697875976562, "logps/rejected": -99.09629821777344, "loss": 1.2431, "rewards/accuracies": 0.671875, "rewards/chosen": -0.017694566398859024, "rewards/margins": 0.20948931574821472, "rewards/rejected": -0.22718387842178345, "step": 159 }, { "epoch": 0.2418745275888133, "epsilon_dpo/beta": 0.07359376549720764, "epsilon_dpo/beta_margin_grad_mean": -0.4621080160140991, "epsilon_dpo/beta_margin_grad_std": 0.10441814363002777, "epsilon_dpo/beta_margin_mean": 0.16097773611545563, "epsilon_dpo/beta_margin_std": 0.44186532497406006, "epsilon_dpo/loss_margin_mean": 2.2304911613464355, "grad_norm": 14.367936134338379, "kl/avg_steps": 0.3125, "kl/beta": 0.07381708920001984, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.7098470178228755e-07, "logits/chosen": -0.6084429621696472, "logits/rejected": -0.48294705152511597, "logps/chosen": -68.7177505493164, "logps/ref_chosen": -68.0002670288086, "logps/ref_rejected": -89.75141143798828, "logps/rejected": -92.69938659667969, "loss": 1.279, "rewards/accuracies": 0.65625, "rewards/chosen": -0.054394595324993134, "rewards/margins": 0.16097773611545563, "rewards/rejected": -0.21537232398986816, "step": 160 }, { "epoch": 0.24338624338624337, "epsilon_dpo/beta": 0.07329551130533218, "epsilon_dpo/beta_margin_grad_mean": -0.45018425583839417, "epsilon_dpo/beta_margin_grad_std": 0.1030559092760086, "epsilon_dpo/beta_margin_mean": 0.20355476438999176, "epsilon_dpo/beta_margin_std": 0.4396621882915497, "epsilon_dpo/loss_margin_mean": 2.8164150714874268, "grad_norm": 14.061203002929688, "kl/avg_steps": 0.40625, "kl/beta": 0.07358712702989578, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.703633408618955e-07, "logits/chosen": -0.23171710968017578, "logits/rejected": -0.12584610283374786, "logps/chosen": -75.62057495117188, "logps/ref_chosen": -75.69575500488281, "logps/ref_rejected": -87.35690307617188, "logps/rejected": -90.09814453125, "loss": 1.2397, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0038347430527210236, "rewards/margins": 0.20355473458766937, "rewards/rejected": -0.19971999526023865, "step": 161 }, { "epoch": 0.24489795918367346, "epsilon_dpo/beta": 0.07290733605623245, "epsilon_dpo/beta_margin_grad_mean": -0.42896273732185364, "epsilon_dpo/beta_margin_grad_std": 0.09937798976898193, "epsilon_dpo/beta_margin_mean": 0.2959074378013611, "epsilon_dpo/beta_margin_std": 0.4288792908191681, "epsilon_dpo/loss_margin_mean": 4.093425273895264, "grad_norm": 16.56731414794922, "kl/avg_steps": 0.53125, "kl/beta": 0.07328939437866211, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.697358159051549e-07, "logits/chosen": -0.42089033126831055, "logits/rejected": -0.21701693534851074, "logps/chosen": -98.07493591308594, "logps/ref_chosen": -97.45020294189453, "logps/ref_rejected": -125.35346984863281, "logps/rejected": -130.07164001464844, "loss": 1.156, "rewards/accuracies": 0.765625, "rewards/chosen": -0.04721798747777939, "rewards/margins": 0.2959074378013611, "rewards/rejected": -0.34312543272972107, "step": 162 }, { "epoch": 0.24640967498110355, "epsilon_dpo/beta": 0.0726131945848465, "epsilon_dpo/beta_margin_grad_mean": -0.4502231180667877, "epsilon_dpo/beta_margin_grad_std": 0.11270121484994888, "epsilon_dpo/beta_margin_mean": 0.209677055478096, "epsilon_dpo/beta_margin_std": 0.47489598393440247, "epsilon_dpo/loss_margin_mean": 2.9341773986816406, "grad_norm": 14.430964469909668, "kl/avg_steps": 0.40625, "kl/beta": 0.07290209829807281, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.691021444652876e-07, "logits/chosen": -0.2879790663719177, "logits/rejected": -0.12870730459690094, "logps/chosen": -73.21658325195312, "logps/ref_chosen": -73.41002655029297, "logps/ref_rejected": -95.94908905029297, "logps/rejected": -98.68983459472656, "loss": 1.2422, "rewards/accuracies": 0.703125, "rewards/chosen": 0.012572411447763443, "rewards/margins": 0.2096770703792572, "rewards/rejected": -0.19710463285446167, "step": 163 }, { "epoch": 0.24792139077853365, "epsilon_dpo/beta": 0.072319395840168, "epsilon_dpo/beta_margin_grad_mean": -0.4320308566093445, "epsilon_dpo/beta_margin_grad_std": 0.12061776965856552, "epsilon_dpo/beta_margin_mean": 0.2894424796104431, "epsilon_dpo/beta_margin_std": 0.5126076936721802, "epsilon_dpo/loss_margin_mean": 4.052041530609131, "grad_norm": 15.295539855957031, "kl/avg_steps": 0.40625, "kl/beta": 0.0726071298122406, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6846234426744624e-07, "logits/chosen": -0.36896005272865295, "logits/rejected": -0.37494707107543945, "logps/chosen": -77.27589416503906, "logps/ref_chosen": -77.78909301757812, "logps/ref_rejected": -102.98542785644531, "logps/rejected": -106.52427673339844, "loss": 1.1808, "rewards/accuracies": 0.703125, "rewards/chosen": 0.034895047545433044, "rewards/margins": 0.2894425392150879, "rewards/rejected": -0.25454750657081604, "step": 164 }, { "epoch": 0.2494331065759637, "epsilon_dpo/beta": 0.071958988904953, "epsilon_dpo/beta_margin_grad_mean": -0.44002482295036316, "epsilon_dpo/beta_margin_grad_std": 0.1064891368150711, "epsilon_dpo/beta_margin_mean": 0.2548327147960663, "epsilon_dpo/beta_margin_std": 0.4599698781967163, "epsilon_dpo/loss_margin_mean": 3.5790834426879883, "grad_norm": 14.637746810913086, "kl/avg_steps": 0.5, "kl/beta": 0.07231336086988449, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.678164332082175e-07, "logits/chosen": -0.16319644451141357, "logits/rejected": -0.32601630687713623, "logps/chosen": -72.39887237548828, "logps/ref_chosen": -72.82852935791016, "logps/ref_rejected": -91.37947845458984, "logps/rejected": -94.52890014648438, "loss": 1.198, "rewards/accuracies": 0.75, "rewards/chosen": 0.029420459643006325, "rewards/margins": 0.2548326849937439, "rewards/rejected": -0.22541223466396332, "step": 165 }, { "epoch": 0.2509448223733938, "epsilon_dpo/beta": 0.07180337607860565, "epsilon_dpo/beta_margin_grad_mean": -0.45170384645462036, "epsilon_dpo/beta_margin_grad_std": 0.10957895219326019, "epsilon_dpo/beta_margin_mean": 0.20550940930843353, "epsilon_dpo/beta_margin_std": 0.46381574869155884, "epsilon_dpo/loss_margin_mean": 2.909606456756592, "grad_norm": 16.492395401000977, "kl/avg_steps": 0.21875, "kl/beta": 0.07195359468460083, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.6716442935512214e-07, "logits/chosen": -0.33589503169059753, "logits/rejected": -0.10567793250083923, "logps/chosen": -88.24549865722656, "logps/ref_chosen": -87.35054016113281, "logps/ref_rejected": -120.07548522949219, "logps/rejected": -123.88005065917969, "loss": 1.2433, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06544095277786255, "rewards/margins": 0.20550942420959473, "rewards/rejected": -0.2709503769874573, "step": 166 }, { "epoch": 0.25245653817082386, "epsilon_dpo/beta": 0.07151201367378235, "epsilon_dpo/beta_margin_grad_mean": -0.4427506625652313, "epsilon_dpo/beta_margin_grad_std": 0.10156535357236862, "epsilon_dpo/beta_margin_mean": 0.24112164974212646, "epsilon_dpo/beta_margin_std": 0.43905171751976013, "epsilon_dpo/loss_margin_mean": 3.4096128940582275, "grad_norm": 14.830485343933105, "kl/avg_steps": 0.40625, "kl/beta": 0.07179653644561768, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.08851295709609985, "logits/rejected": -0.012575190514326096, "logps/chosen": -80.8412857055664, "logps/ref_chosen": -81.61276245117188, "logps/ref_rejected": -98.826171875, "logps/rejected": -101.46430969238281, "loss": 1.2056, "rewards/accuracies": 0.703125, "rewards/chosen": 0.05437930300831795, "rewards/margins": 0.24112167954444885, "rewards/rejected": -0.18674236536026, "step": 167 }, { "epoch": 0.25396825396825395, "epsilon_dpo/beta": 0.07133441418409348, "epsilon_dpo/beta_margin_grad_mean": -0.4560203552246094, "epsilon_dpo/beta_margin_grad_std": 0.10047944635152817, "epsilon_dpo/beta_margin_mean": 0.18437211215496063, "epsilon_dpo/beta_margin_std": 0.4224920868873596, "epsilon_dpo/loss_margin_mean": 2.6299386024475098, "grad_norm": 15.255393028259277, "kl/avg_steps": 0.25, "kl/beta": 0.07150604575872421, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.6584221638904767e-07, "logits/chosen": -0.13477931916713715, "logits/rejected": -0.16727593541145325, "logps/chosen": -88.02157592773438, "logps/ref_chosen": -88.23593139648438, "logps/ref_rejected": -101.98030090332031, "logps/rejected": -104.3958740234375, "loss": 1.2537, "rewards/accuracies": 0.625, "rewards/chosen": 0.013690168038010597, "rewards/margins": 0.18437206745147705, "rewards/rejected": -0.1706819087266922, "step": 168 }, { "epoch": 0.25547996976568405, "epsilon_dpo/beta": 0.07113422453403473, "epsilon_dpo/beta_margin_grad_mean": -0.43899574875831604, "epsilon_dpo/beta_margin_grad_std": 0.13024921715259552, "epsilon_dpo/beta_margin_mean": 0.2600567936897278, "epsilon_dpo/beta_margin_std": 0.559776246547699, "epsilon_dpo/loss_margin_mean": 3.7151412963867188, "grad_norm": 17.136293411254883, "kl/avg_steps": 0.28125, "kl/beta": 0.07132772356271744, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.651720442612075e-07, "logits/chosen": -0.38109487295150757, "logits/rejected": -0.07364605367183685, "logps/chosen": -82.69183349609375, "logps/ref_chosen": -84.35769653320312, "logps/ref_rejected": -102.46897888183594, "logps/rejected": -104.51824951171875, "loss": 1.2181, "rewards/accuracies": 0.640625, "rewards/chosen": 0.11652705073356628, "rewards/margins": 0.2600567936897278, "rewards/rejected": -0.1435297578573227, "step": 169 }, { "epoch": 0.25699168556311414, "epsilon_dpo/beta": 0.07097917795181274, "epsilon_dpo/beta_margin_grad_mean": -0.46621790528297424, "epsilon_dpo/beta_margin_grad_std": 0.1452290415763855, "epsilon_dpo/beta_margin_mean": 0.14188018441200256, "epsilon_dpo/beta_margin_std": 0.6213703155517578, "epsilon_dpo/loss_margin_mean": 2.0687479972839355, "grad_norm": 15.189737319946289, "kl/avg_steps": 0.21875, "kl/beta": 0.07112767547369003, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.6449585330874425e-07, "logits/chosen": -0.35476934909820557, "logits/rejected": -0.16348260641098022, "logps/chosen": -76.20692443847656, "logps/ref_chosen": -77.82612609863281, "logps/ref_rejected": -79.933349609375, "logps/rejected": -80.38288879394531, "loss": 1.3426, "rewards/accuracies": 0.625, "rewards/chosen": 0.11196792125701904, "rewards/margins": 0.14188018441200256, "rewards/rejected": -0.029912270605564117, "step": 170 }, { "epoch": 0.2585034013605442, "epsilon_dpo/beta": 0.0707133561372757, "epsilon_dpo/beta_margin_grad_mean": -0.44336846470832825, "epsilon_dpo/beta_margin_grad_std": 0.1324419528245926, "epsilon_dpo/beta_margin_mean": 0.23818716406822205, "epsilon_dpo/beta_margin_std": 0.5732473731040955, "epsilon_dpo/loss_margin_mean": 3.4258179664611816, "grad_norm": 14.782027244567871, "kl/avg_steps": 0.375, "kl/beta": 0.07097242772579193, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.6381366244617224e-07, "logits/chosen": -0.18573148548603058, "logits/rejected": -0.20463137328624725, "logps/chosen": -83.67991638183594, "logps/ref_chosen": -85.51094055175781, "logps/ref_rejected": -104.4660873413086, "logps/rejected": -106.06088256835938, "loss": 1.2408, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12760449945926666, "rewards/margins": 0.23818716406822205, "rewards/rejected": -0.11058266460895538, "step": 171 }, { "epoch": 0.2600151171579743, "epsilon_dpo/beta": 0.07049336284399033, "epsilon_dpo/beta_margin_grad_mean": -0.43517836928367615, "epsilon_dpo/beta_margin_grad_std": 0.11793834716081619, "epsilon_dpo/beta_margin_mean": 0.28078243136405945, "epsilon_dpo/beta_margin_std": 0.5101608037948608, "epsilon_dpo/loss_margin_mean": 4.030537128448486, "grad_norm": 14.178646087646484, "kl/avg_steps": 0.3125, "kl/beta": 0.0707072764635086, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.631254907558365e-07, "logits/chosen": -0.385815292596817, "logits/rejected": -0.11347803473472595, "logps/chosen": -72.62882995605469, "logps/ref_chosen": -74.67696380615234, "logps/ref_rejected": -106.31913757324219, "logps/rejected": -108.30152893066406, "loss": 1.1869, "rewards/accuracies": 0.671875, "rewards/chosen": 0.14252778887748718, "rewards/margins": 0.28078240156173706, "rewards/rejected": -0.13825464248657227, "step": 172 }, { "epoch": 0.2615268329554044, "epsilon_dpo/beta": 0.07025172561407089, "epsilon_dpo/beta_margin_grad_mean": -0.4308141767978668, "epsilon_dpo/beta_margin_grad_std": 0.14216464757919312, "epsilon_dpo/beta_margin_mean": 0.3127928078174591, "epsilon_dpo/beta_margin_std": 0.6453730463981628, "epsilon_dpo/loss_margin_mean": 4.508734703063965, "grad_norm": 14.488515853881836, "kl/avg_steps": 0.34375, "kl/beta": 0.07048700004816055, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.624313574873786e-07, "logits/chosen": -0.12725037336349487, "logits/rejected": -0.3742305636405945, "logps/chosen": -74.35868072509766, "logps/ref_chosen": -77.24813842773438, "logps/ref_rejected": -101.34745025634766, "logps/rejected": -102.96672058105469, "loss": 1.1937, "rewards/accuracies": 0.671875, "rewards/chosen": 0.20155192911624908, "rewards/margins": 0.31279274821281433, "rewards/rejected": -0.11124081909656525, "step": 173 }, { "epoch": 0.26303854875283444, "epsilon_dpo/beta": 0.06990130245685577, "epsilon_dpo/beta_margin_grad_mean": -0.4227776527404785, "epsilon_dpo/beta_margin_grad_std": 0.11874371767044067, "epsilon_dpo/beta_margin_mean": 0.3279976546764374, "epsilon_dpo/beta_margin_std": 0.512088418006897, "epsilon_dpo/loss_margin_mean": 4.736902713775635, "grad_norm": 15.51446533203125, "kl/avg_steps": 0.5, "kl/beta": 0.07024553418159485, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.61731282057198e-07, "logits/chosen": -0.1422739326953888, "logits/rejected": -0.12538309395313263, "logps/chosen": -80.69758605957031, "logps/ref_chosen": -83.51936340332031, "logps/ref_rejected": -115.21611785888672, "logps/rejected": -117.13125610351562, "loss": 1.1474, "rewards/accuracies": 0.796875, "rewards/chosen": 0.19557899236679077, "rewards/margins": 0.3279976546764374, "rewards/rejected": -0.1324186623096466, "step": 174 }, { "epoch": 0.26455026455026454, "epsilon_dpo/beta": 0.0696190595626831, "epsilon_dpo/beta_margin_grad_mean": -0.434917688369751, "epsilon_dpo/beta_margin_grad_std": 0.13931988179683685, "epsilon_dpo/beta_margin_mean": 0.28064674139022827, "epsilon_dpo/beta_margin_std": 0.6153140664100647, "epsilon_dpo/loss_margin_mean": 4.090598106384277, "grad_norm": 15.483464241027832, "kl/avg_steps": 0.40625, "kl/beta": 0.06989604979753494, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6102528404790965e-07, "logits/chosen": -0.2617759704589844, "logits/rejected": -0.09605693817138672, "logps/chosen": -89.11908721923828, "logps/ref_chosen": -91.16883087158203, "logps/ref_rejected": -111.66940307617188, "logps/rejected": -113.71025085449219, "loss": 1.2145, "rewards/accuracies": 0.703125, "rewards/chosen": 0.14015761017799377, "rewards/margins": 0.28064674139022827, "rewards/rejected": -0.1404891163110733, "step": 175 }, { "epoch": 0.2660619803476946, "epsilon_dpo/beta": 0.06951142847537994, "epsilon_dpo/beta_margin_grad_mean": -0.4651341736316681, "epsilon_dpo/beta_margin_grad_std": 0.12862923741340637, "epsilon_dpo/beta_margin_mean": 0.15457409620285034, "epsilon_dpo/beta_margin_std": 0.5627329349517822, "epsilon_dpo/loss_margin_mean": 2.2834548950195312, "grad_norm": 19.406566619873047, "kl/avg_steps": 0.15625, "kl/beta": 0.0696132481098175, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.603133832077953e-07, "logits/chosen": -0.01893380470573902, "logits/rejected": -0.12899591028690338, "logps/chosen": -104.9249038696289, "logps/ref_chosen": -105.64872741699219, "logps/ref_rejected": -114.00184631347656, "logps/rejected": -115.56147766113281, "loss": 1.3128, "rewards/accuracies": 0.625, "rewards/chosen": 0.04813181981444359, "rewards/margins": 0.15457406640052795, "rewards/rejected": -0.10644224286079407, "step": 176 }, { "epoch": 0.2675736961451247, "epsilon_dpo/beta": 0.0691530704498291, "epsilon_dpo/beta_margin_grad_mean": -0.3918303847312927, "epsilon_dpo/beta_margin_grad_std": 0.128380686044693, "epsilon_dpo/beta_margin_mean": 0.47757646441459656, "epsilon_dpo/beta_margin_std": 0.5835011601448059, "epsilon_dpo/loss_margin_mean": 6.953527450561523, "grad_norm": 16.948251724243164, "kl/avg_steps": 0.515625, "kl/beta": 0.06950464844703674, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.5959559945025183e-07, "logits/chosen": -0.1951223760843277, "logits/rejected": -0.2172054648399353, "logps/chosen": -79.67855834960938, "logps/ref_chosen": -83.90325927734375, "logps/ref_rejected": -124.04662322998047, "logps/rejected": -126.77544403076172, "loss": 1.0424, "rewards/accuracies": 0.796875, "rewards/chosen": 0.29099011421203613, "rewards/margins": 0.4775765538215637, "rewards/rejected": -0.1865864247083664, "step": 177 }, { "epoch": 0.2690854119425548, "epsilon_dpo/beta": 0.06880924105644226, "epsilon_dpo/beta_margin_grad_mean": -0.4229211211204529, "epsilon_dpo/beta_margin_grad_std": 0.1171141117811203, "epsilon_dpo/beta_margin_mean": 0.3316393792629242, "epsilon_dpo/beta_margin_std": 0.5047381520271301, "epsilon_dpo/loss_margin_mean": 4.8638386726379395, "grad_norm": 14.013132095336914, "kl/avg_steps": 0.5, "kl/beta": 0.06914810091257095, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.588719528532341e-07, "logits/chosen": -0.4607001841068268, "logits/rejected": -0.426104336977005, "logps/chosen": -82.26956176757812, "logps/ref_chosen": -85.39146423339844, "logps/ref_rejected": -104.77743530273438, "logps/rejected": -106.51936340332031, "loss": 1.1424, "rewards/accuracies": 0.75, "rewards/chosen": 0.2138364315032959, "rewards/margins": 0.3316393494606018, "rewards/rejected": -0.1178029254078865, "step": 178 }, { "epoch": 0.2705971277399849, "epsilon_dpo/beta": 0.06862808018922806, "epsilon_dpo/beta_margin_grad_mean": -0.4533432424068451, "epsilon_dpo/beta_margin_grad_std": 0.13178221881389618, "epsilon_dpo/beta_margin_mean": 0.20900557935237885, "epsilon_dpo/beta_margin_std": 0.5843459367752075, "epsilon_dpo/loss_margin_mean": 3.102468729019165, "grad_norm": 15.235759735107422, "kl/avg_steps": 0.265625, "kl/beta": 0.06880408525466919, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.581424636586928e-07, "logits/chosen": -0.32951170206069946, "logits/rejected": -0.2942412495613098, "logps/chosen": -86.45561218261719, "logps/ref_chosen": -89.62193298339844, "logps/ref_rejected": -100.42626190185547, "logps/rejected": -100.36241149902344, "loss": 1.2684, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21552518010139465, "rewards/margins": 0.20900559425354004, "rewards/rejected": 0.00651959702372551, "step": 179 }, { "epoch": 0.272108843537415, "epsilon_dpo/beta": 0.06852144002914429, "epsilon_dpo/beta_margin_grad_mean": -0.45583203434944153, "epsilon_dpo/beta_margin_grad_std": 0.12107642740011215, "epsilon_dpo/beta_margin_mean": 0.1924244910478592, "epsilon_dpo/beta_margin_std": 0.5202975869178772, "epsilon_dpo/loss_margin_mean": 2.8651440143585205, "grad_norm": 13.99289321899414, "kl/avg_steps": 0.15625, "kl/beta": 0.06862180680036545, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.5740715227200897e-07, "logits/chosen": -0.39849674701690674, "logits/rejected": -0.4529162645339966, "logps/chosen": -77.17005920410156, "logps/ref_chosen": -80.65196228027344, "logps/ref_rejected": -86.28292083740234, "logps/rejected": -85.66616821289062, "loss": 1.2679, "rewards/accuracies": 0.546875, "rewards/chosen": 0.2367887794971466, "rewards/margins": 0.19242452085018158, "rewards/rejected": 0.044364262372255325, "step": 180 }, { "epoch": 0.273620559334845, "epsilon_dpo/beta": 0.06830747425556183, "epsilon_dpo/beta_margin_grad_mean": -0.43471357226371765, "epsilon_dpo/beta_margin_grad_std": 0.12012200802564621, "epsilon_dpo/beta_margin_mean": 0.28556621074676514, "epsilon_dpo/beta_margin_std": 0.5327972173690796, "epsilon_dpo/loss_margin_mean": 4.225931644439697, "grad_norm": 13.607747077941895, "kl/avg_steps": 0.3125, "kl/beta": 0.06851474940776825, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.566660392614228e-07, "logits/chosen": -0.5008969902992249, "logits/rejected": -0.48943305015563965, "logps/chosen": -79.5925521850586, "logps/ref_chosen": -83.57610321044922, "logps/ref_rejected": -103.83201599121094, "logps/rejected": -104.07440185546875, "loss": 1.1874, "rewards/accuracies": 0.703125, "rewards/chosen": 0.2721977233886719, "rewards/margins": 0.28556621074676514, "rewards/rejected": -0.0133685152977705, "step": 181 }, { "epoch": 0.2751322751322751, "epsilon_dpo/beta": 0.06791312992572784, "epsilon_dpo/beta_margin_grad_mean": -0.40505993366241455, "epsilon_dpo/beta_margin_grad_std": 0.126272052526474, "epsilon_dpo/beta_margin_mean": 0.410185843706131, "epsilon_dpo/beta_margin_std": 0.5547764897346497, "epsilon_dpo/loss_margin_mean": 6.082636833190918, "grad_norm": 14.72663688659668, "kl/avg_steps": 0.578125, "kl/beta": 0.06830131262540817, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.5591914535745817e-07, "logits/chosen": -0.30189305543899536, "logits/rejected": -0.24266424775123596, "logps/chosen": -77.49993896484375, "logps/ref_chosen": -80.32312774658203, "logps/ref_rejected": -116.65972900390625, "logps/rejected": -119.919189453125, "loss": 1.0898, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1906897872686386, "rewards/margins": 0.4101858139038086, "rewards/rejected": -0.21949604153633118, "step": 182 }, { "epoch": 0.2766439909297052, "epsilon_dpo/beta": 0.06780938804149628, "epsilon_dpo/beta_margin_grad_mean": -0.45864781737327576, "epsilon_dpo/beta_margin_grad_std": 0.1380331814289093, "epsilon_dpo/beta_margin_mean": 0.18381942808628082, "epsilon_dpo/beta_margin_std": 0.6125727295875549, "epsilon_dpo/loss_margin_mean": 2.769742727279663, "grad_norm": 15.092416763305664, "kl/avg_steps": 0.15625, "kl/beta": 0.06790871173143387, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.551664914523433e-07, "logits/chosen": -0.32394590973854065, "logits/rejected": -0.3546528220176697, "logps/chosen": -86.47184753417969, "logps/ref_chosen": -86.9107437133789, "logps/ref_rejected": -97.24227142333984, "logps/rejected": -99.5731201171875, "loss": 1.2994, "rewards/accuracies": 0.640625, "rewards/chosen": 0.028311219066381454, "rewards/margins": 0.18381944298744202, "rewards/rejected": -0.15550823509693146, "step": 183 }, { "epoch": 0.2781557067271353, "epsilon_dpo/beta": 0.06756575405597687, "epsilon_dpo/beta_margin_grad_mean": -0.4294240176677704, "epsilon_dpo/beta_margin_grad_std": 0.10901588201522827, "epsilon_dpo/beta_margin_mean": 0.30280619859695435, "epsilon_dpo/beta_margin_std": 0.47133877873420715, "epsilon_dpo/loss_margin_mean": 4.52435302734375, "grad_norm": 13.005136489868164, "kl/avg_steps": 0.359375, "kl/beta": 0.06780277192592621, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.544080985994258e-07, "logits/chosen": -0.3298104405403137, "logits/rejected": -0.39565029740333557, "logps/chosen": -62.69512176513672, "logps/ref_chosen": -66.81837463378906, "logps/ref_rejected": -89.96328735351562, "logps/rejected": -90.36438751220703, "loss": 1.159, "rewards/accuracies": 0.75, "rewards/chosen": 0.2780967354774475, "rewards/margins": 0.30280619859695435, "rewards/rejected": -0.02470945566892624, "step": 184 }, { "epoch": 0.2796674225245654, "epsilon_dpo/beta": 0.06735558807849884, "epsilon_dpo/beta_margin_grad_mean": -0.43276891112327576, "epsilon_dpo/beta_margin_grad_std": 0.15843592584133148, "epsilon_dpo/beta_margin_mean": 0.29344916343688965, "epsilon_dpo/beta_margin_std": 0.6960242986679077, "epsilon_dpo/loss_margin_mean": 4.434981822967529, "grad_norm": 13.516302108764648, "kl/avg_steps": 0.3125, "kl/beta": 0.06755997985601425, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.5364398801258394e-07, "logits/chosen": -0.15685924887657166, "logits/rejected": -0.12343727052211761, "logps/chosen": -69.97865295410156, "logps/ref_chosen": -72.42000579833984, "logps/ref_rejected": -92.72299194335938, "logps/rejected": -94.71661376953125, "loss": 1.2288, "rewards/accuracies": 0.671875, "rewards/chosen": 0.16207338869571686, "rewards/margins": 0.29344916343688965, "rewards/rejected": -0.13137578964233398, "step": 185 }, { "epoch": 0.2811791383219955, "epsilon_dpo/beta": 0.06727205216884613, "epsilon_dpo/beta_margin_grad_mean": -0.43999987840652466, "epsilon_dpo/beta_margin_grad_std": 0.1602521687746048, "epsilon_dpo/beta_margin_mean": 0.27616897225379944, "epsilon_dpo/beta_margin_std": 0.7218940258026123, "epsilon_dpo/loss_margin_mean": 4.186398506164551, "grad_norm": 15.226896286010742, "kl/avg_steps": 0.125, "kl/beta": 0.06734950840473175, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.5287418106563354e-07, "logits/chosen": -0.2841332256793976, "logits/rejected": -0.4217131435871124, "logps/chosen": -90.3564453125, "logps/ref_chosen": -92.51256561279297, "logps/ref_rejected": -108.71427917480469, "logps/rejected": -110.74455261230469, "loss": 1.2501, "rewards/accuracies": 0.625, "rewards/chosen": 0.14296837151050568, "rewards/margins": 0.2761688828468323, "rewards/rejected": -0.1332004964351654, "step": 186 }, { "epoch": 0.28269085411942557, "epsilon_dpo/beta": 0.06704090535640717, "epsilon_dpo/beta_margin_grad_mean": -0.43871721625328064, "epsilon_dpo/beta_margin_grad_std": 0.14905036985874176, "epsilon_dpo/beta_margin_mean": 0.2713906466960907, "epsilon_dpo/beta_margin_std": 0.6602548956871033, "epsilon_dpo/loss_margin_mean": 4.1168060302734375, "grad_norm": 18.4125919342041, "kl/avg_steps": 0.34375, "kl/beta": 0.06726542860269547, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.520986992917297e-07, "logits/chosen": -0.38112810254096985, "logits/rejected": -0.31438401341438293, "logps/chosen": -102.13521575927734, "logps/ref_chosen": -102.16010284423828, "logps/ref_rejected": -131.34176635742188, "logps/rejected": -135.43368530273438, "loss": 1.2356, "rewards/accuracies": 0.671875, "rewards/chosen": -0.0005250517278909683, "rewards/margins": 0.2713906168937683, "rewards/rejected": -0.27191564440727234, "step": 187 }, { "epoch": 0.2842025699168556, "epsilon_dpo/beta": 0.06681124120950699, "epsilon_dpo/beta_margin_grad_mean": -0.4493896961212158, "epsilon_dpo/beta_margin_grad_std": 0.12668459117412567, "epsilon_dpo/beta_margin_mean": 0.21586395800113678, "epsilon_dpo/beta_margin_std": 0.5554152131080627, "epsilon_dpo/loss_margin_mean": 3.2856740951538086, "grad_norm": 14.878789901733398, "kl/avg_steps": 0.34375, "kl/beta": 0.0670349970459938, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.5131756438276466e-07, "logits/chosen": -0.021359417587518692, "logits/rejected": -0.14762753248214722, "logps/chosen": -93.09027099609375, "logps/ref_chosen": -93.75152587890625, "logps/ref_rejected": -102.76870727539062, "logps/rejected": -105.39312744140625, "loss": 1.2552, "rewards/accuracies": 0.671875, "rewards/chosen": 0.04284510016441345, "rewards/margins": 0.21586398780345917, "rewards/rejected": -0.17301888763904572, "step": 188 }, { "epoch": 0.2857142857142857, "epsilon_dpo/beta": 0.06656148284673691, "epsilon_dpo/beta_margin_grad_mean": -0.453034371137619, "epsilon_dpo/beta_margin_grad_std": 0.1559441089630127, "epsilon_dpo/beta_margin_mean": 0.1943899691104889, "epsilon_dpo/beta_margin_std": 0.6854284405708313, "epsilon_dpo/loss_margin_mean": 2.9941604137420654, "grad_norm": 18.815332412719727, "kl/avg_steps": 0.375, "kl/beta": 0.06680534780025482, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.5053079818876096e-07, "logits/chosen": 0.04337912052869797, "logits/rejected": -0.1719989776611328, "logps/chosen": -99.32482147216797, "logps/ref_chosen": -100.89401245117188, "logps/ref_rejected": -95.35594177246094, "logps/rejected": -96.78091430664062, "loss": 1.313, "rewards/accuracies": 0.703125, "rewards/chosen": 0.10105113685131073, "rewards/margins": 0.19438999891281128, "rewards/rejected": -0.09333885461091995, "step": 189 }, { "epoch": 0.2872260015117158, "epsilon_dpo/beta": 0.06612560898065567, "epsilon_dpo/beta_margin_grad_mean": -0.3864794373512268, "epsilon_dpo/beta_margin_grad_std": 0.1279653012752533, "epsilon_dpo/beta_margin_mean": 0.49961158633232117, "epsilon_dpo/beta_margin_std": 0.5765790939331055, "epsilon_dpo/loss_margin_mean": 7.596166610717773, "grad_norm": 17.344053268432617, "kl/avg_steps": 0.65625, "kl/beta": 0.06655576825141907, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.4973842271726024e-07, "logits/chosen": -0.1879466474056244, "logits/rejected": -0.4992838501930237, "logps/chosen": -74.71463012695312, "logps/ref_chosen": -77.86666107177734, "logps/ref_rejected": -133.73385620117188, "logps/rejected": -138.17800903320312, "loss": 1.0242, "rewards/accuracies": 0.859375, "rewards/chosen": 0.20717057585716248, "rewards/margins": 0.49961161613464355, "rewards/rejected": -0.2924410402774811, "step": 190 }, { "epoch": 0.2887377173091459, "epsilon_dpo/beta": 0.06591136008501053, "epsilon_dpo/beta_margin_grad_mean": -0.44709786772727966, "epsilon_dpo/beta_margin_grad_std": 0.1486724317073822, "epsilon_dpo/beta_margin_mean": 0.2231767624616623, "epsilon_dpo/beta_margin_std": 0.6557565331459045, "epsilon_dpo/loss_margin_mean": 3.4554193019866943, "grad_norm": 18.134878158569336, "kl/avg_steps": 0.328125, "kl/beta": 0.06612183898687363, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.48940460132708e-07, "logits/chosen": -0.19696848094463348, "logits/rejected": -0.2077828049659729, "logps/chosen": -100.0518798828125, "logps/ref_chosen": -99.27456665039062, "logps/ref_rejected": -114.78330993652344, "logps/rejected": -119.01604461669922, "loss": 1.2774, "rewards/accuracies": 0.6875, "rewards/chosen": -0.053733598440885544, "rewards/margins": 0.22317670285701752, "rewards/rejected": -0.27691030502319336, "step": 191 }, { "epoch": 0.29024943310657597, "epsilon_dpo/beta": 0.06574740260839462, "epsilon_dpo/beta_margin_grad_mean": -0.4566137194633484, "epsilon_dpo/beta_margin_grad_std": 0.1029883325099945, "epsilon_dpo/beta_margin_mean": 0.18087714910507202, "epsilon_dpo/beta_margin_std": 0.4595518112182617, "epsilon_dpo/loss_margin_mean": 2.7946553230285645, "grad_norm": 12.453398704528809, "kl/avg_steps": 0.25, "kl/beta": 0.06590559333562851, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.481369327558329e-07, "logits/chosen": -0.22844773530960083, "logits/rejected": -0.22785356640815735, "logps/chosen": -73.15953063964844, "logps/ref_chosen": -73.45323181152344, "logps/ref_rejected": -85.39158630371094, "logps/rejected": -87.8925552368164, "loss": 1.2632, "rewards/accuracies": 0.671875, "rewards/chosen": 0.018335875123739243, "rewards/margins": 0.18087714910507202, "rewards/rejected": -0.16254127025604248, "step": 192 }, { "epoch": 0.29176114890400606, "epsilon_dpo/beta": 0.06554234027862549, "epsilon_dpo/beta_margin_grad_mean": -0.42428433895111084, "epsilon_dpo/beta_margin_grad_std": 0.13521122932434082, "epsilon_dpo/beta_margin_mean": 0.3341885805130005, "epsilon_dpo/beta_margin_std": 0.594582200050354, "epsilon_dpo/loss_margin_mean": 5.164119243621826, "grad_norm": 16.317344665527344, "kl/avg_steps": 0.3125, "kl/beta": 0.06574123352766037, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.47327863063023e-07, "logits/chosen": -0.36393725872039795, "logits/rejected": -0.1663932204246521, "logps/chosen": -78.37696838378906, "logps/ref_chosen": -79.29537963867188, "logps/ref_rejected": -85.5767822265625, "logps/rejected": -89.82247924804688, "loss": 1.1627, "rewards/accuracies": 0.640625, "rewards/chosen": 0.05778112635016441, "rewards/margins": 0.3341885805130005, "rewards/rejected": -0.27640748023986816, "step": 193 }, { "epoch": 0.29327286470143615, "epsilon_dpo/beta": 0.0654405727982521, "epsilon_dpo/beta_margin_grad_mean": -0.46438026428222656, "epsilon_dpo/beta_margin_grad_std": 0.15544359385967255, "epsilon_dpo/beta_margin_mean": 0.15591198205947876, "epsilon_dpo/beta_margin_std": 0.6794894933700562, "epsilon_dpo/loss_margin_mean": 2.4638755321502686, "grad_norm": 15.462275505065918, "kl/avg_steps": 0.15625, "kl/beta": 0.06553643196821213, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.4889231324195862, "logits/rejected": -0.4247901439666748, "logps/chosen": -84.87882995605469, "logps/ref_chosen": -83.12230682373047, "logps/ref_rejected": -93.76890563964844, "logps/rejected": -97.98930358886719, "loss": 1.3464, "rewards/accuracies": 0.578125, "rewards/chosen": -0.11777393519878387, "rewards/margins": 0.15591202676296234, "rewards/rejected": -0.2736859619617462, "step": 194 }, { "epoch": 0.2947845804988662, "epsilon_dpo/beta": 0.06521578133106232, "epsilon_dpo/beta_margin_grad_mean": -0.4328776001930237, "epsilon_dpo/beta_margin_grad_std": 0.13702498376369476, "epsilon_dpo/beta_margin_mean": 0.29111796617507935, "epsilon_dpo/beta_margin_std": 0.5985894799232483, "epsilon_dpo/loss_margin_mean": 4.530278205871582, "grad_norm": 16.169418334960938, "kl/avg_steps": 0.34375, "kl/beta": 0.06543419510126114, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.4569318740967043e-07, "logits/chosen": -0.3322495222091675, "logits/rejected": -0.22550660371780396, "logps/chosen": -99.76998901367188, "logps/ref_chosen": -96.8233642578125, "logps/ref_rejected": -96.86456298828125, "logps/rejected": -104.34147644042969, "loss": 1.201, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19461895525455475, "rewards/margins": 0.2911180257797241, "rewards/rejected": -0.4857369661331177, "step": 195 }, { "epoch": 0.2962962962962963, "epsilon_dpo/beta": 0.06499237567186356, "epsilon_dpo/beta_margin_grad_mean": -0.4459932744503021, "epsilon_dpo/beta_margin_grad_std": 0.1392875760793686, "epsilon_dpo/beta_margin_mean": 0.23551592230796814, "epsilon_dpo/beta_margin_std": 0.6242495179176331, "epsilon_dpo/loss_margin_mean": 3.6898553371429443, "grad_norm": 14.393869400024414, "kl/avg_steps": 0.34375, "kl/beta": 0.06521003693342209, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.448676271745197e-07, "logits/chosen": -0.21855714917182922, "logits/rejected": -0.2570999562740326, "logps/chosen": -83.5823974609375, "logps/ref_chosen": -82.40751647949219, "logps/ref_rejected": -105.99139404296875, "logps/rejected": -110.85611724853516, "loss": 1.2556, "rewards/accuracies": 0.703125, "rewards/chosen": -0.07799074053764343, "rewards/margins": 0.23551593720912933, "rewards/rejected": -0.3135066628456116, "step": 196 }, { "epoch": 0.29780801209372637, "epsilon_dpo/beta": 0.06474941968917847, "epsilon_dpo/beta_margin_grad_mean": -0.42367416620254517, "epsilon_dpo/beta_margin_grad_std": 0.16686230897903442, "epsilon_dpo/beta_margin_mean": 0.32828599214553833, "epsilon_dpo/beta_margin_std": 0.7847632169723511, "epsilon_dpo/loss_margin_mean": 5.152902603149414, "grad_norm": 15.43140697479248, "kl/avg_steps": 0.375, "kl/beta": 0.06498664617538452, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.440366160729392e-07, "logits/chosen": -0.6653242111206055, "logits/rejected": -0.3941148519515991, "logps/chosen": -75.24999237060547, "logps/ref_chosen": -74.27025604248047, "logps/ref_rejected": -95.37227630615234, "logps/rejected": -101.50491333007812, "loss": 1.2247, "rewards/accuracies": 0.703125, "rewards/chosen": -0.06569398194551468, "rewards/margins": 0.32828599214553833, "rewards/rejected": -0.3939799666404724, "step": 197 }, { "epoch": 0.29931972789115646, "epsilon_dpo/beta": 0.06440634280443192, "epsilon_dpo/beta_margin_grad_mean": -0.41623857617378235, "epsilon_dpo/beta_margin_grad_std": 0.12911495566368103, "epsilon_dpo/beta_margin_mean": 0.358009934425354, "epsilon_dpo/beta_margin_std": 0.5631706118583679, "epsilon_dpo/loss_margin_mean": 5.61193323135376, "grad_norm": 14.343767166137695, "kl/avg_steps": 0.53125, "kl/beta": 0.06474385410547256, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.432001773500957e-07, "logits/chosen": -0.4333815574645996, "logits/rejected": -0.5065505504608154, "logps/chosen": -86.1900634765625, "logps/ref_chosen": -84.80004119873047, "logps/ref_rejected": -98.13402557373047, "logps/rejected": -105.135986328125, "loss": 1.1351, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09154967963695526, "rewards/margins": 0.358009934425354, "rewards/rejected": -0.44955962896347046, "step": 198 }, { "epoch": 0.30083144368858655, "epsilon_dpo/beta": 0.06418675184249878, "epsilon_dpo/beta_margin_grad_mean": -0.4314574897289276, "epsilon_dpo/beta_margin_grad_std": 0.16398167610168457, "epsilon_dpo/beta_margin_mean": 0.2991337776184082, "epsilon_dpo/beta_margin_std": 0.7234874963760376, "epsilon_dpo/loss_margin_mean": 4.746899127960205, "grad_norm": 14.889111518859863, "kl/avg_steps": 0.34375, "kl/beta": 0.06440171599388123, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.4235833440297856e-07, "logits/chosen": -0.2650891840457916, "logits/rejected": -0.44171708822250366, "logps/chosen": -82.58881378173828, "logps/ref_chosen": -79.6952133178711, "logps/ref_rejected": -110.57994079589844, "logps/rejected": -118.2204360961914, "loss": 1.2329, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18838956952095032, "rewards/margins": 0.2991337776184082, "rewards/rejected": -0.48752331733703613, "step": 199 }, { "epoch": 0.30234315948601664, "epsilon_dpo/beta": 0.06386657804250717, "epsilon_dpo/beta_margin_grad_mean": -0.4096252918243408, "epsilon_dpo/beta_margin_grad_std": 0.14677537977695465, "epsilon_dpo/beta_margin_mean": 0.39887288212776184, "epsilon_dpo/beta_margin_std": 0.6632535457611084, "epsilon_dpo/loss_margin_mean": 6.310145378112793, "grad_norm": 13.412322044372559, "kl/avg_steps": 0.5, "kl/beta": 0.06418109685182571, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.3834892511367798, "logits/rejected": -0.5493783354759216, "logps/chosen": -81.55732727050781, "logps/ref_chosen": -80.175048828125, "logps/ref_rejected": -113.15582275390625, "logps/rejected": -120.84823608398438, "loss": 1.1283, "rewards/accuracies": 0.734375, "rewards/chosen": -0.09074348211288452, "rewards/margins": 0.39887285232543945, "rewards/rejected": -0.489616334438324, "step": 200 }, { "epoch": 0.30234315948601664, "eval_epsilon_dpo/beta": 0.06364637613296509, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4313368499279022, "eval_epsilon_dpo/beta_margin_grad_std": 0.14446234703063965, "eval_epsilon_dpo/beta_margin_mean": 0.3057065010070801, "eval_epsilon_dpo/beta_margin_std": 0.6497645974159241, "eval_epsilon_dpo/loss_margin_mean": 4.874636650085449, "eval_kl/n_epsilon_steps": 0.32614436745643616, "eval_kl/p_epsilon_steps": 0.67341548204422, "eval_logits/chosen": -0.338997483253479, "eval_logits/rejected": -0.38629141449928284, "eval_logps/chosen": -89.90900421142578, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -111.59197998046875, "eval_loss": 0.6027774810791016, "eval_rewards/accuracies": 0.6830986142158508, "eval_rewards/chosen": -0.16038812696933746, "eval_rewards/margins": 0.30570653080940247, "eval_rewards/rejected": -0.46609458327293396, "eval_runtime": 47.6329, "eval_samples_per_second": 48.349, "eval_steps_per_second": 1.512, "step": 200 }, { "epoch": 0.30385487528344673, "epsilon_dpo/beta": 0.06350891292095184, "epsilon_dpo/beta_margin_grad_mean": -0.42801105976104736, "epsilon_dpo/beta_margin_grad_std": 0.1340506672859192, "epsilon_dpo/beta_margin_mean": 0.3071900010108948, "epsilon_dpo/beta_margin_std": 0.5804892182350159, "epsilon_dpo/loss_margin_mean": 4.892658710479736, "grad_norm": 16.5773983001709, "kl/avg_steps": 0.5625, "kl/beta": 0.06386178731918335, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.4065853017905953e-07, "logits/chosen": -0.4059561491012573, "logits/rejected": -0.4868294894695282, "logps/chosen": -88.41127014160156, "logps/ref_chosen": -84.85652160644531, "logps/ref_rejected": -108.86637878417969, "logps/rejected": -117.31378173828125, "loss": 1.1828, "rewards/accuracies": 0.765625, "rewards/chosen": -0.22657953202724457, "rewards/margins": 0.30719006061553955, "rewards/rejected": -0.5337696075439453, "step": 201 }, { "epoch": 0.30536659108087677, "epsilon_dpo/beta": 0.06325291842222214, "epsilon_dpo/beta_margin_grad_mean": -0.4159080684185028, "epsilon_dpo/beta_margin_grad_std": 0.12844079732894897, "epsilon_dpo/beta_margin_mean": 0.3679068386554718, "epsilon_dpo/beta_margin_std": 0.5659220218658447, "epsilon_dpo/loss_margin_mean": 5.876931667327881, "grad_norm": 13.360112190246582, "kl/avg_steps": 0.40625, "kl/beta": 0.06350457668304443, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.3980061644943575e-07, "logits/chosen": -0.134725883603096, "logits/rejected": -0.25779327750205994, "logps/chosen": -71.51620483398438, "logps/ref_chosen": -70.20382690429688, "logps/ref_rejected": -100.29237365722656, "logps/rejected": -107.481689453125, "loss": 1.1267, "rewards/accuracies": 0.703125, "rewards/chosen": -0.08515353500843048, "rewards/margins": 0.3679068088531494, "rewards/rejected": -0.4530603289604187, "step": 202 }, { "epoch": 0.30687830687830686, "epsilon_dpo/beta": 0.06301674991846085, "epsilon_dpo/beta_margin_grad_mean": -0.42265784740448, "epsilon_dpo/beta_margin_grad_std": 0.12643830478191376, "epsilon_dpo/beta_margin_mean": 0.3411216735839844, "epsilon_dpo/beta_margin_std": 0.5760848522186279, "epsilon_dpo/loss_margin_mean": 5.4683661460876465, "grad_norm": 14.424872398376465, "kl/avg_steps": 0.375, "kl/beta": 0.06324762850999832, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.3893739358856455e-07, "logits/chosen": -0.09640582650899887, "logits/rejected": -0.22556348145008087, "logps/chosen": -89.77330017089844, "logps/ref_chosen": -86.46543884277344, "logps/ref_rejected": -120.1901626586914, "logps/rejected": -128.9663848876953, "loss": 1.1503, "rewards/accuracies": 0.703125, "rewards/chosen": -0.21057778596878052, "rewards/margins": 0.3411216735839844, "rewards/rejected": -0.5516994595527649, "step": 203 }, { "epoch": 0.30839002267573695, "epsilon_dpo/beta": 0.06276163458824158, "epsilon_dpo/beta_margin_grad_mean": -0.4142310321331024, "epsilon_dpo/beta_margin_grad_std": 0.14236027002334595, "epsilon_dpo/beta_margin_mean": 0.38478749990463257, "epsilon_dpo/beta_margin_std": 0.6492862701416016, "epsilon_dpo/loss_margin_mean": 6.1974663734436035, "grad_norm": 16.044506072998047, "kl/avg_steps": 0.40625, "kl/beta": 0.06301134079694748, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.380688857426449e-07, "logits/chosen": -0.31047549843788147, "logits/rejected": -0.5649159550666809, "logps/chosen": -71.08057403564453, "logps/ref_chosen": -70.06008911132812, "logps/ref_rejected": -91.88562774658203, "logps/rejected": -99.10357666015625, "loss": 1.1346, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0658092349767685, "rewards/margins": 0.38478749990463257, "rewards/rejected": -0.4505966901779175, "step": 204 }, { "epoch": 0.30990173847316704, "epsilon_dpo/beta": 0.06256653368473053, "epsilon_dpo/beta_margin_grad_mean": -0.442996084690094, "epsilon_dpo/beta_margin_grad_std": 0.1461196094751358, "epsilon_dpo/beta_margin_mean": 0.26055842638015747, "epsilon_dpo/beta_margin_std": 0.6598429679870605, "epsilon_dpo/loss_margin_mean": 4.234796047210693, "grad_norm": 17.57623863220215, "kl/avg_steps": 0.3125, "kl/beta": 0.06275638937950134, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.3719511720570814e-07, "logits/chosen": -0.4885348081588745, "logits/rejected": -0.2372855544090271, "logps/chosen": -96.82564544677734, "logps/ref_chosen": -93.42503356933594, "logps/ref_rejected": -119.87689208984375, "logps/rejected": -127.51229858398438, "loss": 1.2436, "rewards/accuracies": 0.640625, "rewards/chosen": -0.21464571356773376, "rewards/margins": 0.26055842638015747, "rewards/rejected": -0.47520411014556885, "step": 205 }, { "epoch": 0.31141345427059713, "epsilon_dpo/beta": 0.06254758685827255, "epsilon_dpo/beta_margin_grad_mean": -0.47024503350257874, "epsilon_dpo/beta_margin_grad_std": 0.14309605956077576, "epsilon_dpo/beta_margin_mean": 0.14074857532978058, "epsilon_dpo/beta_margin_std": 0.6398756504058838, "epsilon_dpo/loss_margin_mean": 2.3255414962768555, "grad_norm": 17.001508712768555, "kl/avg_steps": 0.03125, "kl/beta": 0.06256088614463806, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.363161124189387e-07, "logits/chosen": -0.2468646764755249, "logits/rejected": -0.2657265067100525, "logps/chosen": -98.77737426757812, "logps/ref_chosen": -95.62075805664062, "logps/ref_rejected": -104.67747497558594, "logps/rejected": -110.15962982177734, "loss": 1.3465, "rewards/accuracies": 0.515625, "rewards/chosen": -0.2001642882823944, "rewards/margins": 0.14074859023094177, "rewards/rejected": -0.3409128785133362, "step": 206 }, { "epoch": 0.3129251700680272, "epsilon_dpo/beta": 0.06233258917927742, "epsilon_dpo/beta_margin_grad_mean": -0.4336472749710083, "epsilon_dpo/beta_margin_grad_std": 0.1409965306520462, "epsilon_dpo/beta_margin_mean": 0.3076102137565613, "epsilon_dpo/beta_margin_std": 0.657630980014801, "epsilon_dpo/loss_margin_mean": 5.001832008361816, "grad_norm": 15.785563468933105, "kl/avg_steps": 0.34375, "kl/beta": 0.06254134327173233, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.3543189596998986e-07, "logits/chosen": -0.29754525423049927, "logits/rejected": -0.6161233186721802, "logps/chosen": -96.36029815673828, "logps/ref_chosen": -92.49520111083984, "logps/ref_rejected": -111.60420989990234, "logps/rejected": -120.47113800048828, "loss": 1.1999, "rewards/accuracies": 0.609375, "rewards/chosen": -0.2434634566307068, "rewards/margins": 0.3076101243495941, "rewards/rejected": -0.5510735511779785, "step": 207 }, { "epoch": 0.3144368858654573, "epsilon_dpo/beta": 0.06219697371125221, "epsilon_dpo/beta_margin_grad_mean": -0.46254763007164, "epsilon_dpo/beta_margin_grad_std": 0.1336774379014969, "epsilon_dpo/beta_margin_mean": 0.16149714589118958, "epsilon_dpo/beta_margin_std": 0.579863429069519, "epsilon_dpo/loss_margin_mean": 2.666975736618042, "grad_norm": 18.437692642211914, "kl/avg_steps": 0.21875, "kl/beta": 0.06232709437608719, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.3454249259229664e-07, "logits/chosen": -0.3396787643432617, "logits/rejected": -0.10453208535909653, "logps/chosen": -84.78022003173828, "logps/ref_chosen": -83.91865539550781, "logps/ref_rejected": -95.56571960449219, "logps/rejected": -99.09425354003906, "loss": 1.3116, "rewards/accuracies": 0.609375, "rewards/chosen": -0.05670349299907684, "rewards/margins": 0.16149716079235077, "rewards/rejected": -0.21820063889026642, "step": 208 }, { "epoch": 0.31594860166288735, "epsilon_dpo/beta": 0.061905719339847565, "epsilon_dpo/beta_margin_grad_mean": -0.4098471403121948, "epsilon_dpo/beta_margin_grad_std": 0.1620044857263565, "epsilon_dpo/beta_margin_mean": 0.41248318552970886, "epsilon_dpo/beta_margin_std": 0.7426827549934387, "epsilon_dpo/loss_margin_mean": 6.738656044006348, "grad_norm": 15.497926712036133, "kl/avg_steps": 0.46875, "kl/beta": 0.06219105049967766, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.336479271643833e-07, "logits/chosen": -0.33976924419403076, "logits/rejected": -0.22139668464660645, "logps/chosen": -94.8779296875, "logps/ref_chosen": -93.79673767089844, "logps/ref_rejected": -115.7679672241211, "logps/rejected": -123.58782196044922, "loss": 1.1415, "rewards/accuracies": 0.734375, "rewards/chosen": -0.06984454393386841, "rewards/margins": 0.4124831557273865, "rewards/rejected": -0.4823276996612549, "step": 209 }, { "epoch": 0.31746031746031744, "epsilon_dpo/beta": 0.06163623929023743, "epsilon_dpo/beta_margin_grad_mean": -0.40473130345344543, "epsilon_dpo/beta_margin_grad_std": 0.14648084342479706, "epsilon_dpo/beta_margin_mean": 0.42534348368644714, "epsilon_dpo/beta_margin_std": 0.6573120951652527, "epsilon_dpo/loss_margin_mean": 6.972165584564209, "grad_norm": 13.21751594543457, "kl/avg_steps": 0.4375, "kl/beta": 0.06190089136362076, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.327482247091679e-07, "logits/chosen": -0.26555413007736206, "logits/rejected": -0.3610946238040924, "logps/chosen": -78.26898956298828, "logps/ref_chosen": -77.90690612792969, "logps/ref_rejected": -110.78089141845703, "logps/rejected": -118.11514282226562, "loss": 1.1053, "rewards/accuracies": 0.71875, "rewards/chosen": -0.024063657969236374, "rewards/margins": 0.42534342408180237, "rewards/rejected": -0.44940710067749023, "step": 210 }, { "epoch": 0.31897203325774753, "epsilon_dpo/beta": 0.06140627712011337, "epsilon_dpo/beta_margin_grad_mean": -0.4172515571117401, "epsilon_dpo/beta_margin_grad_std": 0.14959876239299774, "epsilon_dpo/beta_margin_mean": 0.3725390136241913, "epsilon_dpo/beta_margin_std": 0.6855942010879517, "epsilon_dpo/loss_margin_mean": 6.1406707763671875, "grad_norm": 14.281912803649902, "kl/avg_steps": 0.375, "kl/beta": 0.061631254851818085, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.3184341039326217e-07, "logits/chosen": -0.3958446979522705, "logits/rejected": -0.356731116771698, "logps/chosen": -76.7433853149414, "logps/ref_chosen": -76.47579956054688, "logps/ref_rejected": -117.42009735107422, "logps/rejected": -123.82835388183594, "loss": 1.1555, "rewards/accuracies": 0.703125, "rewards/chosen": -0.0184482354670763, "rewards/margins": 0.37253904342651367, "rewards/rejected": -0.3909872770309448, "step": 211 }, { "epoch": 0.3204837490551776, "epsilon_dpo/beta": 0.06119605526328087, "epsilon_dpo/beta_margin_grad_mean": -0.4093796908855438, "epsilon_dpo/beta_margin_grad_std": 0.16085541248321533, "epsilon_dpo/beta_margin_mean": 0.406364768743515, "epsilon_dpo/beta_margin_std": 0.7220626473426819, "epsilon_dpo/loss_margin_mean": 6.725605487823486, "grad_norm": 12.916170120239258, "kl/avg_steps": 0.34375, "kl/beta": 0.06140100210905075, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.309335095262675e-07, "logits/chosen": -0.272196501493454, "logits/rejected": -0.17583511769771576, "logps/chosen": -78.71829223632812, "logps/ref_chosen": -79.10020446777344, "logps/ref_rejected": -103.29380798339844, "logps/rejected": -109.63749694824219, "loss": 1.1416, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020729010924696922, "rewards/margins": 0.4063647985458374, "rewards/rejected": -0.3856357932090759, "step": 212 }, { "epoch": 0.3219954648526077, "epsilon_dpo/beta": 0.06105324998497963, "epsilon_dpo/beta_margin_grad_mean": -0.4278034269809723, "epsilon_dpo/beta_margin_grad_std": 0.156768798828125, "epsilon_dpo/beta_margin_mean": 0.32995331287384033, "epsilon_dpo/beta_margin_std": 0.7114284038543701, "epsilon_dpo/loss_margin_mean": 5.488154888153076, "grad_norm": 14.290369033813477, "kl/avg_steps": 0.234375, "kl/beta": 0.0611906573176384, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.3001854756006724e-07, "logits/chosen": -0.6963798999786377, "logits/rejected": -0.4149439334869385, "logps/chosen": -86.58470153808594, "logps/ref_chosen": -86.60450744628906, "logps/ref_rejected": -100.115478515625, "logps/rejected": -105.58381652832031, "loss": 1.2, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0002879425883293152, "rewards/margins": 0.32995331287384033, "rewards/rejected": -0.33024126291275024, "step": 213 }, { "epoch": 0.3235071806500378, "epsilon_dpo/beta": 0.060843806713819504, "epsilon_dpo/beta_margin_grad_mean": -0.4379379451274872, "epsilon_dpo/beta_margin_grad_std": 0.14877955615520477, "epsilon_dpo/beta_margin_mean": 0.28321486711502075, "epsilon_dpo/beta_margin_std": 0.6764626502990723, "epsilon_dpo/loss_margin_mean": 4.7317938804626465, "grad_norm": 15.292763710021973, "kl/avg_steps": 0.34375, "kl/beta": 0.06104757636785507, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.290985500881143e-07, "logits/chosen": -0.5268340706825256, "logits/rejected": -0.4801381826400757, "logps/chosen": -83.47666931152344, "logps/ref_chosen": -83.61862182617188, "logps/ref_rejected": -89.77618408203125, "logps/rejected": -94.36602020263672, "loss": 1.2285, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005956515669822693, "rewards/margins": 0.28321486711502075, "rewards/rejected": -0.27725836634635925, "step": 214 }, { "epoch": 0.3250188964474679, "epsilon_dpo/beta": 0.06063537672162056, "epsilon_dpo/beta_margin_grad_mean": -0.406276673078537, "epsilon_dpo/beta_margin_grad_std": 0.15514318645000458, "epsilon_dpo/beta_margin_mean": 0.43478846549987793, "epsilon_dpo/beta_margin_std": 0.7331078052520752, "epsilon_dpo/loss_margin_mean": 7.249897480010986, "grad_norm": 13.326201438903809, "kl/avg_steps": 0.34375, "kl/beta": 0.06083844602108002, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.281735428447157e-07, "logits/chosen": -0.2851630747318268, "logits/rejected": -0.34577393531799316, "logps/chosen": -87.37467956542969, "logps/ref_chosen": -85.94501495361328, "logps/ref_rejected": -125.37004089355469, "logps/rejected": -134.0496063232422, "loss": 1.1174, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08875112980604172, "rewards/margins": 0.43478846549987793, "rewards/rejected": -0.5235395431518555, "step": 215 }, { "epoch": 0.32653061224489793, "epsilon_dpo/beta": 0.060351863503456116, "epsilon_dpo/beta_margin_grad_mean": -0.421083003282547, "epsilon_dpo/beta_margin_grad_std": 0.1392340064048767, "epsilon_dpo/beta_margin_mean": 0.34421074390411377, "epsilon_dpo/beta_margin_std": 0.6146458983421326, "epsilon_dpo/loss_margin_mean": 5.769903659820557, "grad_norm": 13.648635864257812, "kl/avg_steps": 0.46875, "kl/beta": 0.06063003093004227, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.2724355170431247e-07, "logits/chosen": -0.1937863528728485, "logits/rejected": -0.4227668046951294, "logps/chosen": -91.29273986816406, "logps/ref_chosen": -90.13990783691406, "logps/ref_rejected": -121.17817687988281, "logps/rejected": -128.10092163085938, "loss": 1.1601, "rewards/accuracies": 0.734375, "rewards/chosen": -0.07172287255525589, "rewards/margins": 0.34421074390411377, "rewards/rejected": -0.41593360900878906, "step": 216 }, { "epoch": 0.328042328042328, "epsilon_dpo/beta": 0.060070279985666275, "epsilon_dpo/beta_margin_grad_mean": -0.41138482093811035, "epsilon_dpo/beta_margin_grad_std": 0.16538718342781067, "epsilon_dpo/beta_margin_mean": 0.41298747062683105, "epsilon_dpo/beta_margin_std": 0.7676586508750916, "epsilon_dpo/loss_margin_mean": 6.954167366027832, "grad_norm": 13.441911697387695, "kl/avg_steps": 0.46875, "kl/beta": 0.06034715101122856, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.26308602680756e-07, "logits/chosen": -0.3257751166820526, "logits/rejected": -0.33707672357559204, "logps/chosen": -84.47261047363281, "logps/ref_chosen": -83.57549285888672, "logps/ref_rejected": -116.87894439697266, "logps/rejected": -124.73023986816406, "loss": 1.1486, "rewards/accuracies": 0.703125, "rewards/chosen": -0.056727487593889236, "rewards/margins": 0.41298750042915344, "rewards/rejected": -0.4697149693965912, "step": 217 }, { "epoch": 0.3295540438397581, "epsilon_dpo/beta": 0.059921421110630035, "epsilon_dpo/beta_margin_grad_mean": -0.44961637258529663, "epsilon_dpo/beta_margin_grad_std": 0.1689213514328003, "epsilon_dpo/beta_margin_mean": 0.22604058682918549, "epsilon_dpo/beta_margin_std": 0.7624347805976868, "epsilon_dpo/loss_margin_mean": 3.869769334793091, "grad_norm": 15.115226745605469, "kl/avg_steps": 0.25, "kl/beta": 0.06006559357047081, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.253687219265803e-07, "logits/chosen": -0.2687731981277466, "logits/rejected": -0.22891171276569366, "logps/chosen": -101.1187744140625, "logps/ref_chosen": -98.71665954589844, "logps/ref_rejected": -97.98152160644531, "logps/rejected": -104.25341033935547, "loss": 1.3086, "rewards/accuracies": 0.625, "rewards/chosen": -0.1473478376865387, "rewards/margins": 0.22604063153266907, "rewards/rejected": -0.37338846921920776, "step": 218 }, { "epoch": 0.3310657596371882, "epsilon_dpo/beta": 0.05964091420173645, "epsilon_dpo/beta_margin_grad_mean": -0.4214233458042145, "epsilon_dpo/beta_margin_grad_std": 0.13727082312107086, "epsilon_dpo/beta_margin_mean": 0.34724336862564087, "epsilon_dpo/beta_margin_std": 0.6157631278038025, "epsilon_dpo/loss_margin_mean": 5.883241176605225, "grad_norm": 12.273845672607422, "kl/avg_steps": 0.46875, "kl/beta": 0.05991580709815025, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.2442393573227043e-07, "logits/chosen": -0.3256840109825134, "logits/rejected": -0.4019477367401123, "logps/chosen": -83.3803939819336, "logps/ref_chosen": -82.32901000976562, "logps/ref_rejected": -97.81877136230469, "logps/rejected": -104.7533950805664, "loss": 1.1569, "rewards/accuracies": 0.734375, "rewards/chosen": -0.06491810828447342, "rewards/margins": 0.34724336862564087, "rewards/rejected": -0.4121614992618561, "step": 219 }, { "epoch": 0.3325774754346183, "epsilon_dpo/beta": 0.05939992889761925, "epsilon_dpo/beta_margin_grad_mean": -0.43890780210494995, "epsilon_dpo/beta_margin_grad_std": 0.1572382152080536, "epsilon_dpo/beta_margin_mean": 0.27426812052726746, "epsilon_dpo/beta_margin_std": 0.6969823837280273, "epsilon_dpo/loss_margin_mean": 4.694486618041992, "grad_norm": 12.854135513305664, "kl/avg_steps": 0.40625, "kl/beta": 0.05963626131415367, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.234742705255272e-07, "logits/chosen": -0.30797386169433594, "logits/rejected": -0.4406611919403076, "logps/chosen": -67.56721496582031, "logps/ref_chosen": -67.49903106689453, "logps/ref_rejected": -90.92181396484375, "logps/rejected": -95.68447875976562, "loss": 1.2449, "rewards/accuracies": 0.625, "rewards/chosen": -0.006153309717774391, "rewards/margins": 0.27426809072494507, "rewards/rejected": -0.2804214358329773, "step": 220 }, { "epoch": 0.3340891912320484, "epsilon_dpo/beta": 0.059215277433395386, "epsilon_dpo/beta_margin_grad_mean": -0.41286715865135193, "epsilon_dpo/beta_margin_grad_std": 0.1621244251728058, "epsilon_dpo/beta_margin_mean": 0.4035100042819977, "epsilon_dpo/beta_margin_std": 0.7555540800094604, "epsilon_dpo/loss_margin_mean": 6.901949405670166, "grad_norm": 13.689908027648926, "kl/avg_steps": 0.3125, "kl/beta": 0.059394966810941696, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.22519752870528e-07, "logits/chosen": -0.3771657347679138, "logits/rejected": -0.46649545431137085, "logps/chosen": -83.02555847167969, "logps/ref_chosen": -83.29585266113281, "logps/ref_rejected": -115.793701171875, "logps/rejected": -122.42536163330078, "loss": 1.1516, "rewards/accuracies": 0.671875, "rewards/chosen": 0.014289181679487228, "rewards/margins": 0.4035099744796753, "rewards/rejected": -0.38922083377838135, "step": 221 }, { "epoch": 0.3356009070294785, "epsilon_dpo/beta": 0.05888276919722557, "epsilon_dpo/beta_margin_grad_mean": -0.3975021243095398, "epsilon_dpo/beta_margin_grad_std": 0.1471283882856369, "epsilon_dpo/beta_margin_mean": 0.46506625413894653, "epsilon_dpo/beta_margin_std": 0.6867176294326782, "epsilon_dpo/loss_margin_mean": 7.96026086807251, "grad_norm": 15.286458015441895, "kl/avg_steps": 0.5625, "kl/beta": 0.05920993909239769, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -0.23492375016212463, "logits/rejected": -0.323875367641449, "logps/chosen": -82.56568908691406, "logps/ref_chosen": -82.82731628417969, "logps/ref_rejected": -131.6415252685547, "logps/rejected": -139.34014892578125, "loss": 1.0802, "rewards/accuracies": 0.75, "rewards/chosen": 0.012266065925359726, "rewards/margins": 0.4650663137435913, "rewards/rejected": -0.4528002142906189, "step": 222 }, { "epoch": 0.3371126228269085, "epsilon_dpo/beta": 0.05864541232585907, "epsilon_dpo/beta_margin_grad_mean": -0.40855130553245544, "epsilon_dpo/beta_margin_grad_std": 0.1467190384864807, "epsilon_dpo/beta_margin_mean": 0.4161318242549896, "epsilon_dpo/beta_margin_std": 0.6898074150085449, "epsilon_dpo/loss_margin_mean": 7.165706634521484, "grad_norm": 13.55103588104248, "kl/avg_steps": 0.40625, "kl/beta": 0.05887874588370323, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.2059626715039065e-07, "logits/chosen": -0.4994870126247406, "logits/rejected": -0.4353235363960266, "logps/chosen": -86.22138977050781, "logps/ref_chosen": -84.97879028320312, "logps/ref_rejected": -108.11666870117188, "logps/rejected": -116.52497100830078, "loss": 1.1195, "rewards/accuracies": 0.75, "rewards/chosen": -0.07521222531795502, "rewards/margins": 0.41613179445266724, "rewards/rejected": -0.49134403467178345, "step": 223 }, { "epoch": 0.3386243386243386, "epsilon_dpo/beta": 0.05844460055232048, "epsilon_dpo/beta_margin_grad_mean": -0.4341517984867096, "epsilon_dpo/beta_margin_grad_std": 0.1348971128463745, "epsilon_dpo/beta_margin_mean": 0.29352185130119324, "epsilon_dpo/beta_margin_std": 0.5995880365371704, "epsilon_dpo/loss_margin_mean": 5.082745552062988, "grad_norm": 14.399001121520996, "kl/avg_steps": 0.34375, "kl/beta": 0.05864051729440689, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.1962735288928304e-07, "logits/chosen": -0.36321765184402466, "logits/rejected": -0.22956721484661102, "logps/chosen": -86.13837432861328, "logps/ref_chosen": -85.66336822509766, "logps/ref_rejected": -96.351806640625, "logps/rejected": -101.90956115722656, "loss": 1.1982, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02954496629536152, "rewards/margins": 0.29352182149887085, "rewards/rejected": -0.3230668306350708, "step": 224 }, { "epoch": 0.3401360544217687, "epsilon_dpo/beta": 0.05822630599141121, "epsilon_dpo/beta_margin_grad_mean": -0.4103103578090668, "epsilon_dpo/beta_margin_grad_std": 0.17332212626934052, "epsilon_dpo/beta_margin_mean": 0.42294129729270935, "epsilon_dpo/beta_margin_std": 0.8520727753639221, "epsilon_dpo/loss_margin_mean": 7.355151176452637, "grad_norm": 16.749916076660156, "kl/avg_steps": 0.375, "kl/beta": 0.05843963101506233, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.186536937864752e-07, "logits/chosen": -0.5275589823722839, "logits/rejected": -0.610481321811676, "logps/chosen": -96.54353332519531, "logps/ref_chosen": -95.25894165039062, "logps/ref_rejected": -144.3567657470703, "logps/rejected": -152.99652099609375, "loss": 1.1655, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07754532992839813, "rewards/margins": 0.4229414165019989, "rewards/rejected": -0.5004867315292358, "step": 225 }, { "epoch": 0.3416477702191988, "epsilon_dpo/beta": 0.0580451674759388, "epsilon_dpo/beta_margin_grad_mean": -0.41428643465042114, "epsilon_dpo/beta_margin_grad_std": 0.1615278124809265, "epsilon_dpo/beta_margin_mean": 0.3854374289512634, "epsilon_dpo/beta_margin_std": 0.7472378015518188, "epsilon_dpo/loss_margin_mean": 6.730257034301758, "grad_norm": 12.055249214172363, "kl/avg_steps": 0.3125, "kl/beta": 0.058221302926540375, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.176753170773052e-07, "logits/chosen": -0.4694936275482178, "logits/rejected": -0.34530776739120483, "logps/chosen": -71.9918441772461, "logps/ref_chosen": -73.33175659179688, "logps/ref_rejected": -90.7422103881836, "logps/rejected": -96.13255310058594, "loss": 1.1649, "rewards/accuracies": 0.671875, "rewards/chosen": 0.07627996802330017, "rewards/margins": 0.3854374289512634, "rewards/rejected": -0.30915746092796326, "step": 226 }, { "epoch": 0.3431594860166289, "epsilon_dpo/beta": 0.057846199721097946, "epsilon_dpo/beta_margin_grad_mean": -0.4127749502658844, "epsilon_dpo/beta_margin_grad_std": 0.1692953109741211, "epsilon_dpo/beta_margin_mean": 0.39249685406684875, "epsilon_dpo/beta_margin_std": 0.7651001811027527, "epsilon_dpo/loss_margin_mean": 6.88016939163208, "grad_norm": 18.469097137451172, "kl/avg_steps": 0.34375, "kl/beta": 0.05803992599248886, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.166922501290729e-07, "logits/chosen": -0.22612786293029785, "logits/rejected": -0.5124807953834534, "logps/chosen": -81.56525421142578, "logps/ref_chosen": -80.71163940429688, "logps/ref_rejected": -98.33506774902344, "logps/rejected": -106.06885528564453, "loss": 1.1672, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0527096726000309, "rewards/margins": 0.39249682426452637, "rewards/rejected": -0.44520652294158936, "step": 227 }, { "epoch": 0.34467120181405897, "epsilon_dpo/beta": 0.0576661080121994, "epsilon_dpo/beta_margin_grad_mean": -0.42423751950263977, "epsilon_dpo/beta_margin_grad_std": 0.15881699323654175, "epsilon_dpo/beta_margin_mean": 0.34450241923332214, "epsilon_dpo/beta_margin_std": 0.7262859344482422, "epsilon_dpo/loss_margin_mean": 6.062928199768066, "grad_norm": 13.120859146118164, "kl/avg_steps": 0.3125, "kl/beta": 0.05784109607338905, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.1570452044027405e-07, "logits/chosen": -0.4411237835884094, "logits/rejected": -0.3885362148284912, "logps/chosen": -78.60456848144531, "logps/ref_chosen": -77.68626403808594, "logps/ref_rejected": -103.74498748779297, "logps/rejected": -110.7262191772461, "loss": 1.192, "rewards/accuracies": 0.671875, "rewards/chosen": -0.05492077395319939, "rewards/margins": 0.34450244903564453, "rewards/rejected": -0.39942318201065063, "step": 228 }, { "epoch": 0.34618291761148906, "epsilon_dpo/beta": 0.05747736245393753, "epsilon_dpo/beta_margin_grad_mean": -0.41639742255210876, "epsilon_dpo/beta_margin_grad_std": 0.15416114032268524, "epsilon_dpo/beta_margin_mean": 0.37639522552490234, "epsilon_dpo/beta_margin_std": 0.7211213707923889, "epsilon_dpo/loss_margin_mean": 6.627065658569336, "grad_norm": 12.59009075164795, "kl/avg_steps": 0.328125, "kl/beta": 0.05766090750694275, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.147121556398312e-07, "logits/chosen": -0.40707510709762573, "logits/rejected": -0.6244249939918518, "logps/chosen": -69.3853759765625, "logps/ref_chosen": -70.9267349243164, "logps/ref_rejected": -98.36172485351562, "logps/rejected": -103.44743347167969, "loss": 1.1623, "rewards/accuracies": 0.703125, "rewards/chosen": 0.08642115443944931, "rewards/margins": 0.37639522552490234, "rewards/rejected": -0.28997403383255005, "step": 229 }, { "epoch": 0.3476946334089191, "epsilon_dpo/beta": 0.057280492037534714, "epsilon_dpo/beta_margin_grad_mean": -0.4393908679485321, "epsilon_dpo/beta_margin_grad_std": 0.16247457265853882, "epsilon_dpo/beta_margin_mean": 0.2653091549873352, "epsilon_dpo/beta_margin_std": 0.730425238609314, "epsilon_dpo/loss_margin_mean": 4.723310947418213, "grad_norm": 15.629660606384277, "kl/avg_steps": 0.34375, "kl/beta": 0.05747232586145401, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.46971315145492554, "logits/rejected": -0.42873239517211914, "logps/chosen": -83.31251525878906, "logps/ref_chosen": -82.13688659667969, "logps/ref_rejected": -83.09074401855469, "logps/rejected": -88.98968505859375, "loss": 1.2632, "rewards/accuracies": 0.671875, "rewards/chosen": -0.07054407149553299, "rewards/margins": 0.26530909538269043, "rewards/rejected": -0.3358531594276428, "step": 230 }, { "epoch": 0.3492063492063492, "epsilon_dpo/beta": 0.05708426609635353, "epsilon_dpo/beta_margin_grad_mean": -0.4018583595752716, "epsilon_dpo/beta_margin_grad_std": 0.16053839027881622, "epsilon_dpo/beta_margin_mean": 0.4458349347114563, "epsilon_dpo/beta_margin_std": 0.7325132489204407, "epsilon_dpo/loss_margin_mean": 7.896346569061279, "grad_norm": 13.946819305419922, "kl/avg_steps": 0.34375, "kl/beta": 0.05727544054389, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.1271363186719835e-07, "logits/chosen": -0.14116013050079346, "logits/rejected": -0.1820957064628601, "logps/chosen": -102.2670669555664, "logps/ref_chosen": -100.55113220214844, "logps/ref_rejected": -107.24727630615234, "logps/rejected": -116.85955810546875, "loss": 1.1122, "rewards/accuracies": 0.734375, "rewards/chosen": -0.09947866201400757, "rewards/margins": 0.4458349943161011, "rewards/rejected": -0.5453135967254639, "step": 231 }, { "epoch": 0.3507180650037793, "epsilon_dpo/beta": 0.056933220475912094, "epsilon_dpo/beta_margin_grad_mean": -0.43795302510261536, "epsilon_dpo/beta_margin_grad_std": 0.1754705309867859, "epsilon_dpo/beta_margin_mean": 0.3054857552051544, "epsilon_dpo/beta_margin_std": 0.8524812459945679, "epsilon_dpo/loss_margin_mean": 5.4650115966796875, "grad_norm": 14.59022331237793, "kl/avg_steps": 0.265625, "kl/beta": 0.05707923322916031, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.1170752879801436e-07, "logits/chosen": -0.5040056109428406, "logits/rejected": -0.3939094543457031, "logps/chosen": -94.84075927734375, "logps/ref_chosen": -92.6686019897461, "logps/ref_rejected": -103.07643127441406, "logps/rejected": -110.71360778808594, "loss": 1.2637, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1255113184452057, "rewards/margins": 0.3054857552051544, "rewards/rejected": -0.4309970736503601, "step": 232 }, { "epoch": 0.35222978080120937, "epsilon_dpo/beta": 0.05688033252954483, "epsilon_dpo/beta_margin_grad_mean": -0.4426785707473755, "epsilon_dpo/beta_margin_grad_std": 0.17714117467403412, "epsilon_dpo/beta_margin_mean": 0.270059198141098, "epsilon_dpo/beta_margin_std": 0.8090578317642212, "epsilon_dpo/loss_margin_mean": 4.856035232543945, "grad_norm": 15.220146179199219, "kl/avg_steps": 0.09375, "kl/beta": 0.05692801624536514, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.33606183528900146, "logits/rejected": -0.13796675205230713, "logps/chosen": -80.97364807128906, "logps/ref_chosen": -78.285400390625, "logps/ref_rejected": -90.51336669921875, "logps/rejected": -98.05764770507812, "loss": 1.2856, "rewards/accuracies": 0.546875, "rewards/chosen": -0.15561680495738983, "rewards/margins": 0.27005913853645325, "rewards/rejected": -0.4256759583950043, "step": 233 }, { "epoch": 0.35374149659863946, "epsilon_dpo/beta": 0.05677373334765434, "epsilon_dpo/beta_margin_grad_mean": -0.4403716027736664, "epsilon_dpo/beta_margin_grad_std": 0.1675698608160019, "epsilon_dpo/beta_margin_mean": 0.2730686664581299, "epsilon_dpo/beta_margin_std": 0.7592166662216187, "epsilon_dpo/loss_margin_mean": 4.9085798263549805, "grad_norm": 13.379533767700195, "kl/avg_steps": 0.1875, "kl/beta": 0.05687469616532326, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.09681781007452e-07, "logits/chosen": -0.3230592608451843, "logits/rejected": -0.5212711095809937, "logps/chosen": -77.2991943359375, "logps/ref_chosen": -75.72798919677734, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -80.60368347167969, "loss": 1.2656, "rewards/accuracies": 0.625, "rewards/chosen": -0.09258531779050827, "rewards/margins": 0.2730686664581299, "rewards/rejected": -0.36565396189689636, "step": 234 }, { "epoch": 0.35525321239606955, "epsilon_dpo/beta": 0.05650780722498894, "epsilon_dpo/beta_margin_grad_mean": -0.3946419656276703, "epsilon_dpo/beta_margin_grad_std": 0.15039300918579102, "epsilon_dpo/beta_margin_mean": 0.47568777203559875, "epsilon_dpo/beta_margin_std": 0.6847705841064453, "epsilon_dpo/loss_margin_mean": 8.491762161254883, "grad_norm": 13.730059623718262, "kl/avg_steps": 0.46875, "kl/beta": 0.056768257170915604, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.08662192950594e-07, "logits/chosen": -0.6019189953804016, "logits/rejected": -0.48583483695983887, "logps/chosen": -91.29273223876953, "logps/ref_chosen": -91.11560821533203, "logps/ref_rejected": -103.01558685302734, "logps/rejected": -111.68447875976562, "loss": 1.0732, "rewards/accuracies": 0.78125, "rewards/chosen": -0.011768028140068054, "rewards/margins": 0.47568780183792114, "rewards/rejected": -0.4874558448791504, "step": 235 }, { "epoch": 0.35676492819349964, "epsilon_dpo/beta": 0.05634119734168053, "epsilon_dpo/beta_margin_grad_mean": -0.4295879006385803, "epsilon_dpo/beta_margin_grad_std": 0.17866680026054382, "epsilon_dpo/beta_margin_mean": 0.31965601444244385, "epsilon_dpo/beta_margin_std": 0.8243433833122253, "epsilon_dpo/loss_margin_mean": 5.777461051940918, "grad_norm": 16.337749481201172, "kl/avg_steps": 0.296875, "kl/beta": 0.05650339648127556, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.076381667711306e-07, "logits/chosen": -0.3394400179386139, "logits/rejected": -0.1670718789100647, "logps/chosen": -101.02061462402344, "logps/ref_chosen": -97.19900512695312, "logps/ref_rejected": -106.0116958618164, "logps/rejected": -115.61076354980469, "loss": 1.2481, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21814972162246704, "rewards/margins": 0.31965598464012146, "rewards/rejected": -0.5378056764602661, "step": 236 }, { "epoch": 0.35827664399092973, "epsilon_dpo/beta": 0.05618331953883171, "epsilon_dpo/beta_margin_grad_mean": -0.4469059705734253, "epsilon_dpo/beta_margin_grad_std": 0.15130768716335297, "epsilon_dpo/beta_margin_mean": 0.23650793731212616, "epsilon_dpo/beta_margin_std": 0.6870653629302979, "epsilon_dpo/loss_margin_mean": 4.295169830322266, "grad_norm": 15.307231903076172, "kl/avg_steps": 0.28125, "kl/beta": 0.05633614957332611, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.066097311132753e-07, "logits/chosen": -0.24095523357391357, "logits/rejected": -0.20215189456939697, "logps/chosen": -102.84424591064453, "logps/ref_chosen": -99.35952758789062, "logps/ref_rejected": -105.2945327758789, "logps/rejected": -113.07442474365234, "loss": 1.2733, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19786638021469116, "rewards/margins": 0.23650792241096497, "rewards/rejected": -0.43437430262565613, "step": 237 }, { "epoch": 0.35978835978835977, "epsilon_dpo/beta": 0.055964212864637375, "epsilon_dpo/beta_margin_grad_mean": -0.41444844007492065, "epsilon_dpo/beta_margin_grad_std": 0.14432160556316376, "epsilon_dpo/beta_margin_mean": 0.38114407658576965, "epsilon_dpo/beta_margin_std": 0.648664116859436, "epsilon_dpo/loss_margin_mean": 6.885334014892578, "grad_norm": 13.714317321777344, "kl/avg_steps": 0.390625, "kl/beta": 0.056178148835897446, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.0557691474458414e-07, "logits/chosen": -0.6821615695953369, "logits/rejected": -0.23968210816383362, "logps/chosen": -89.17784118652344, "logps/ref_chosen": -87.75861358642578, "logps/ref_rejected": -99.15858459472656, "logps/rejected": -107.46315002441406, "loss": 1.1384, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08189239352941513, "rewards/margins": 0.38114404678344727, "rewards/rejected": -0.463036447763443, "step": 238 }, { "epoch": 0.36130007558578986, "epsilon_dpo/beta": 0.055807750672101974, "epsilon_dpo/beta_margin_grad_mean": -0.4088519811630249, "epsilon_dpo/beta_margin_grad_std": 0.1726495772600174, "epsilon_dpo/beta_margin_mean": 0.43803808093070984, "epsilon_dpo/beta_margin_std": 0.8137868046760559, "epsilon_dpo/loss_margin_mean": 7.949976921081543, "grad_norm": 14.539200782775879, "kl/avg_steps": 0.28125, "kl/beta": 0.0559595562517643, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.045397465551513e-07, "logits/chosen": -0.19835148751735687, "logits/rejected": -0.379528284072876, "logps/chosen": -84.41043090820312, "logps/ref_chosen": -81.45846557617188, "logps/ref_rejected": -142.63424682617188, "logps/rejected": -153.5361785888672, "loss": 1.1434, "rewards/accuracies": 0.671875, "rewards/chosen": -0.16567102074623108, "rewards/margins": 0.4380381107330322, "rewards/rejected": -0.6037091016769409, "step": 239 }, { "epoch": 0.36281179138321995, "epsilon_dpo/beta": 0.0555814690887928, "epsilon_dpo/beta_margin_grad_mean": -0.38159269094467163, "epsilon_dpo/beta_margin_grad_std": 0.17057675123214722, "epsilon_dpo/beta_margin_mean": 0.5508310198783875, "epsilon_dpo/beta_margin_std": 0.7981725335121155, "epsilon_dpo/loss_margin_mean": 10.005630493164062, "grad_norm": 25.778789520263672, "kl/avg_steps": 0.40625, "kl/beta": 0.0558026097714901, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.0349825555680045e-07, "logits/chosen": -0.7162148356437683, "logits/rejected": -0.9191411137580872, "logps/chosen": -79.45154571533203, "logps/ref_chosen": -77.65220642089844, "logps/ref_rejected": -106.41152954101562, "logps/rejected": -118.21650695800781, "loss": 1.0521, "rewards/accuracies": 0.734375, "rewards/chosen": -0.10241775214672089, "rewards/margins": 0.5508310198783875, "rewards/rejected": -0.6532487869262695, "step": 240 }, { "epoch": 0.36432350718065004, "epsilon_dpo/beta": 0.055408693850040436, "epsilon_dpo/beta_margin_grad_mean": -0.43489742279052734, "epsilon_dpo/beta_margin_grad_std": 0.15454921126365662, "epsilon_dpo/beta_margin_mean": 0.293466180562973, "epsilon_dpo/beta_margin_std": 0.6903191208839417, "epsilon_dpo/loss_margin_mean": 5.379743576049805, "grad_norm": 17.371824264526367, "kl/avg_steps": 0.3125, "kl/beta": 0.055576831102371216, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.0245247088227377e-07, "logits/chosen": -0.3993128538131714, "logits/rejected": -0.2141626477241516, "logps/chosen": -97.16249084472656, "logps/ref_chosen": -93.96852111816406, "logps/ref_rejected": -109.2076187133789, "logps/rejected": -117.78132629394531, "loss": 1.2255, "rewards/accuracies": 0.625, "rewards/chosen": -0.18001073598861694, "rewards/margins": 0.2934662103652954, "rewards/rejected": -0.47347694635391235, "step": 241 }, { "epoch": 0.36583522297808013, "epsilon_dpo/beta": 0.0550975576043129, "epsilon_dpo/beta_margin_grad_mean": -0.3924703598022461, "epsilon_dpo/beta_margin_grad_std": 0.15534812211990356, "epsilon_dpo/beta_margin_mean": 0.4966244101524353, "epsilon_dpo/beta_margin_std": 0.757362961769104, "epsilon_dpo/loss_margin_mean": 9.078295707702637, "grad_norm": 13.380143165588379, "kl/avg_steps": 0.5625, "kl/beta": 0.0554036945104599, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.0140242178441665e-07, "logits/chosen": -0.2932683825492859, "logits/rejected": -0.4387352466583252, "logps/chosen": -82.11421966552734, "logps/ref_chosen": -82.18013000488281, "logps/ref_rejected": -89.77629089355469, "logps/rejected": -98.78868103027344, "loss": 1.0755, "rewards/accuracies": 0.828125, "rewards/chosen": 0.0007947832345962524, "rewards/margins": 0.4966244101524353, "rewards/rejected": -0.49582961201667786, "step": 242 }, { "epoch": 0.3673469387755102, "epsilon_dpo/beta": 0.05490989610552788, "epsilon_dpo/beta_margin_grad_mean": -0.4230293035507202, "epsilon_dpo/beta_margin_grad_std": 0.14619854092597961, "epsilon_dpo/beta_margin_mean": 0.341871976852417, "epsilon_dpo/beta_margin_std": 0.6454283595085144, "epsilon_dpo/loss_margin_mean": 6.306732177734375, "grad_norm": 13.98609447479248, "kl/avg_steps": 0.34375, "kl/beta": 0.05509379133582115, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.003481376353596e-07, "logits/chosen": -0.4575600028038025, "logits/rejected": -0.29443448781967163, "logps/chosen": -104.02952575683594, "logps/ref_chosen": -100.66859436035156, "logps/ref_rejected": -97.08364868164062, "logps/rejected": -106.75131225585938, "loss": 1.1711, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1866493821144104, "rewards/margins": 0.341871976852417, "rewards/rejected": -0.5285213589668274, "step": 243 }, { "epoch": 0.3688586545729403, "epsilon_dpo/beta": 0.054567355662584305, "epsilon_dpo/beta_margin_grad_mean": -0.3743903636932373, "epsilon_dpo/beta_margin_grad_std": 0.12789258360862732, "epsilon_dpo/beta_margin_mean": 0.5610859990119934, "epsilon_dpo/beta_margin_std": 0.6003847718238831, "epsilon_dpo/loss_margin_mean": 10.335267066955566, "grad_norm": 12.74890422821045, "kl/avg_steps": 0.625, "kl/beta": 0.05490505322813988, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.9928964792569654e-07, "logits/chosen": -0.606474757194519, "logits/rejected": -0.2900153696537018, "logps/chosen": -76.24296569824219, "logps/ref_chosen": -74.50486755371094, "logps/ref_rejected": -102.90267944335938, "logps/rejected": -114.97602844238281, "loss": 0.9822, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09550999850034714, "rewards/margins": 0.5610860586166382, "rewards/rejected": -0.6565960049629211, "step": 244 }, { "epoch": 0.37037037037037035, "epsilon_dpo/beta": 0.05426253378391266, "epsilon_dpo/beta_margin_grad_mean": -0.3894428312778473, "epsilon_dpo/beta_margin_grad_std": 0.14738242328166962, "epsilon_dpo/beta_margin_mean": 0.49736499786376953, "epsilon_dpo/beta_margin_std": 0.6809157729148865, "epsilon_dpo/loss_margin_mean": 9.233874320983887, "grad_norm": 13.286972999572754, "kl/avg_steps": 0.5625, "kl/beta": 0.05456402897834778, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.982269822636601e-07, "logits/chosen": -0.4713189899921417, "logits/rejected": -0.4719199240207672, "logps/chosen": -97.87628173828125, "logps/ref_chosen": -94.40076446533203, "logps/ref_rejected": -102.05097961425781, "logps/rejected": -114.7603759765625, "loss": 1.0545, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18986687064170837, "rewards/margins": 0.49736499786376953, "rewards/rejected": -0.6872318387031555, "step": 245 }, { "epoch": 0.37188208616780044, "epsilon_dpo/beta": 0.05406075716018677, "epsilon_dpo/beta_margin_grad_mean": -0.39330968260765076, "epsilon_dpo/beta_margin_grad_std": 0.16505803167819977, "epsilon_dpo/beta_margin_mean": 0.4963088929653168, "epsilon_dpo/beta_margin_std": 0.7848570942878723, "epsilon_dpo/loss_margin_mean": 9.28079891204834, "grad_norm": 13.795814514160156, "kl/avg_steps": 0.375, "kl/beta": 0.05425882339477539, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.971601703742932e-07, "logits/chosen": -0.3446606397628784, "logits/rejected": -0.3383950889110565, "logps/chosen": -103.00593566894531, "logps/ref_chosen": -97.2917709350586, "logps/ref_rejected": -120.41865539550781, "logps/rejected": -135.4136199951172, "loss": 1.0868, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31252914667129517, "rewards/margins": 0.4963088631629944, "rewards/rejected": -0.8088380098342896, "step": 246 }, { "epoch": 0.37339380196523053, "epsilon_dpo/beta": 0.05397704616189003, "epsilon_dpo/beta_margin_grad_mean": -0.4514468312263489, "epsilon_dpo/beta_margin_grad_std": 0.15922772884368896, "epsilon_dpo/beta_margin_mean": 0.2120126336812973, "epsilon_dpo/beta_margin_std": 0.7254804968833923, "epsilon_dpo/loss_margin_mean": 4.027678489685059, "grad_norm": 14.925507545471191, "kl/avg_steps": 0.15625, "kl/beta": 0.054056111723184586, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.960892420986177e-07, "logits/chosen": -0.24031943082809448, "logits/rejected": -0.38908693194389343, "logps/chosen": -117.6470718383789, "logps/ref_chosen": -110.15637969970703, "logps/ref_rejected": -112.48919677734375, "logps/rejected": -124.007568359375, "loss": 1.3077, "rewards/accuracies": 0.609375, "rewards/chosen": -0.40666455030441284, "rewards/margins": 0.2120126336812973, "rewards/rejected": -0.6186771988868713, "step": 247 }, { "epoch": 0.3749055177626606, "epsilon_dpo/beta": 0.053825367242097855, "epsilon_dpo/beta_margin_grad_mean": -0.4187783896923065, "epsilon_dpo/beta_margin_grad_std": 0.18080280721187592, "epsilon_dpo/beta_margin_mean": 0.3853808343410492, "epsilon_dpo/beta_margin_std": 0.8422636389732361, "epsilon_dpo/loss_margin_mean": 7.274705410003662, "grad_norm": 17.41864585876465, "kl/avg_steps": 0.28125, "kl/beta": 0.0539717823266983, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -0.3880723714828491, "logits/rejected": -0.15597718954086304, "logps/chosen": -99.78440856933594, "logps/ref_chosen": -94.91020202636719, "logps/ref_rejected": -79.59503936767578, "logps/rejected": -91.74395751953125, "loss": 1.1984, "rewards/accuracies": 0.625, "rewards/chosen": -0.2669926881790161, "rewards/margins": 0.3853807747364044, "rewards/rejected": -0.6523734927177429, "step": 248 }, { "epoch": 0.3764172335600907, "epsilon_dpo/beta": 0.053691230714321136, "epsilon_dpo/beta_margin_grad_mean": -0.473882794380188, "epsilon_dpo/beta_margin_grad_std": 0.17014063894748688, "epsilon_dpo/beta_margin_mean": 0.1020829826593399, "epsilon_dpo/beta_margin_std": 0.773459255695343, "epsilon_dpo/loss_margin_mean": 2.0085771083831787, "grad_norm": 16.087905883789062, "kl/avg_steps": 0.25, "kl/beta": 0.0538204126060009, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.9393515632731094e-07, "logits/chosen": -0.12361128628253937, "logits/rejected": -0.1863332986831665, "logps/chosen": -108.78060913085938, "logps/ref_chosen": -99.98388671875, "logps/ref_rejected": -84.525146484375, "logps/rejected": -95.33045959472656, "loss": 1.4266, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47448354959487915, "rewards/margins": 0.10208297520875931, "rewards/rejected": -0.5765665173530579, "step": 249 }, { "epoch": 0.3779289493575208, "epsilon_dpo/beta": 0.05338955298066139, "epsilon_dpo/beta_margin_grad_mean": -0.3938334286212921, "epsilon_dpo/beta_margin_grad_std": 0.16821321845054626, "epsilon_dpo/beta_margin_mean": 0.49425008893013, "epsilon_dpo/beta_margin_std": 0.8463791012763977, "epsilon_dpo/loss_margin_mean": 9.341487884521484, "grad_norm": 15.321463584899902, "kl/avg_steps": 0.5625, "kl/beta": 0.053686197847127914, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9285205908608934e-07, "logits/chosen": -0.49714428186416626, "logits/rejected": -0.35792437195777893, "logps/chosen": -92.70217895507812, "logps/ref_chosen": -85.96786499023438, "logps/ref_rejected": -103.78448486328125, "logps/rejected": -119.86029052734375, "loss": 1.1055, "rewards/accuracies": 0.78125, "rewards/chosen": -0.36198216676712036, "rewards/margins": 0.4942500591278076, "rewards/rejected": -0.8562322854995728, "step": 250 }, { "epoch": 0.3794406651549509, "epsilon_dpo/beta": 0.05319102481007576, "epsilon_dpo/beta_margin_grad_mean": -0.4373018145561218, "epsilon_dpo/beta_margin_grad_std": 0.1502736508846283, "epsilon_dpo/beta_margin_mean": 0.2876345217227936, "epsilon_dpo/beta_margin_std": 0.6921249032020569, "epsilon_dpo/loss_margin_mean": 5.49154806137085, "grad_norm": 12.652570724487305, "kl/avg_steps": 0.375, "kl/beta": 0.0533859021961689, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.9176496596569265e-07, "logits/chosen": -0.245034322142601, "logits/rejected": -0.4103810787200928, "logps/chosen": -95.39961242675781, "logps/ref_chosen": -88.44728088378906, "logps/ref_rejected": -106.68901062011719, "logps/rejected": -119.13288879394531, "loss": 1.2288, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3725150227546692, "rewards/margins": 0.2876344919204712, "rewards/rejected": -0.6601495146751404, "step": 251 }, { "epoch": 0.38095238095238093, "epsilon_dpo/beta": 0.05302554741501808, "epsilon_dpo/beta_margin_grad_mean": -0.44193825125694275, "epsilon_dpo/beta_margin_grad_std": 0.16500435769557953, "epsilon_dpo/beta_margin_mean": 0.2562330365180969, "epsilon_dpo/beta_margin_std": 0.7518081068992615, "epsilon_dpo/loss_margin_mean": 4.933616638183594, "grad_norm": 14.027976989746094, "kl/avg_steps": 0.3125, "kl/beta": 0.05318645387887955, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.9067390737445254e-07, "logits/chosen": -0.49801328778266907, "logits/rejected": -0.47471922636032104, "logps/chosen": -83.92085266113281, "logps/ref_chosen": -78.4037857055664, "logps/ref_rejected": -99.27626037597656, "logps/rejected": -109.7269515991211, "loss": 1.2776, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2952837347984314, "rewards/margins": 0.2562330365180969, "rewards/rejected": -0.5515167713165283, "step": 252 }, { "epoch": 0.382464096749811, "epsilon_dpo/beta": 0.052843790501356125, "epsilon_dpo/beta_margin_grad_mean": -0.44336360692977905, "epsilon_dpo/beta_margin_grad_std": 0.16899996995925903, "epsilon_dpo/beta_margin_mean": 0.265546977519989, "epsilon_dpo/beta_margin_std": 0.7731426358222961, "epsilon_dpo/loss_margin_mean": 5.125515937805176, "grad_norm": 12.717514038085938, "kl/avg_steps": 0.34375, "kl/beta": 0.05302076414227486, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.8957891383162304e-07, "logits/chosen": -0.19492888450622559, "logits/rejected": -0.21276481449604034, "logps/chosen": -78.13883972167969, "logps/ref_chosen": -72.53218078613281, "logps/ref_rejected": -83.00707244873047, "logps/rejected": -93.7392578125, "loss": 1.276, "rewards/accuracies": 0.65625, "rewards/chosen": -0.298408180475235, "rewards/margins": 0.265546977519989, "rewards/rejected": -0.5639551877975464, "step": 253 }, { "epoch": 0.3839758125472411, "epsilon_dpo/beta": 0.05267927050590515, "epsilon_dpo/beta_margin_grad_mean": -0.41813188791275024, "epsilon_dpo/beta_margin_grad_std": 0.15210388600826263, "epsilon_dpo/beta_margin_mean": 0.374256432056427, "epsilon_dpo/beta_margin_std": 0.6875149011611938, "epsilon_dpo/loss_margin_mean": 7.195765495300293, "grad_norm": 14.125617980957031, "kl/avg_steps": 0.3125, "kl/beta": 0.05283912643790245, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.884800159665276e-07, "logits/chosen": -0.7151049971580505, "logits/rejected": -0.4137560725212097, "logps/chosen": -93.89373779296875, "logps/ref_chosen": -87.48554229736328, "logps/ref_rejected": -109.45498657226562, "logps/rejected": -123.0589370727539, "loss": 1.1554, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3408017158508301, "rewards/margins": 0.374256432056427, "rewards/rejected": -0.7150582075119019, "step": 254 }, { "epoch": 0.3854875283446712, "epsilon_dpo/beta": 0.052449315786361694, "epsilon_dpo/beta_margin_grad_mean": -0.41780516505241394, "epsilon_dpo/beta_margin_grad_std": 0.16844013333320618, "epsilon_dpo/beta_margin_mean": 0.3639993667602539, "epsilon_dpo/beta_margin_std": 0.7727952599525452, "epsilon_dpo/loss_margin_mean": 7.039681434631348, "grad_norm": 18.118885040283203, "kl/avg_steps": 0.4375, "kl/beta": 0.05267452076077461, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.873772445177015e-07, "logits/chosen": -0.3129955530166626, "logits/rejected": -0.255302757024765, "logps/chosen": -103.62435913085938, "logps/ref_chosen": -96.826171875, "logps/ref_rejected": -111.32254028320312, "logps/rejected": -125.16040802001953, "loss": 1.1925, "rewards/accuracies": 0.75, "rewards/chosen": -0.35845616459846497, "rewards/margins": 0.3639993667602539, "rewards/rejected": -0.7224555015563965, "step": 255 }, { "epoch": 0.3869992441421013, "epsilon_dpo/beta": 0.05220445618033409, "epsilon_dpo/beta_margin_grad_mean": -0.3989577889442444, "epsilon_dpo/beta_margin_grad_std": 0.1645575612783432, "epsilon_dpo/beta_margin_mean": 0.45816469192504883, "epsilon_dpo/beta_margin_std": 0.7570108771324158, "epsilon_dpo/loss_margin_mean": 8.868268966674805, "grad_norm": 14.02364730834961, "kl/avg_steps": 0.46875, "kl/beta": 0.052445072680711746, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.862706303320329e-07, "logits/chosen": -0.24509219825267792, "logits/rejected": -0.5150793790817261, "logps/chosen": -93.66590881347656, "logps/ref_chosen": -87.4644546508789, "logps/ref_rejected": -119.12409973144531, "logps/rejected": -134.19381713867188, "loss": 1.1105, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3268200755119324, "rewards/margins": 0.45816469192504883, "rewards/rejected": -0.7849847674369812, "step": 256 }, { "epoch": 0.3885109599395314, "epsilon_dpo/beta": 0.05199351906776428, "epsilon_dpo/beta_margin_grad_mean": -0.42260271310806274, "epsilon_dpo/beta_margin_grad_std": 0.1644478142261505, "epsilon_dpo/beta_margin_mean": 0.3578561544418335, "epsilon_dpo/beta_margin_std": 0.7562159895896912, "epsilon_dpo/loss_margin_mean": 6.97858190536499, "grad_norm": 15.032943725585938, "kl/avg_steps": 0.40625, "kl/beta": 0.05220038443803787, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.851602043638994e-07, "logits/chosen": -0.33761709928512573, "logits/rejected": -0.21985182166099548, "logps/chosen": -104.91004943847656, "logps/ref_chosen": -96.47090148925781, "logps/ref_rejected": -131.09390258789062, "logps/rejected": -146.5116424560547, "loss": 1.1908, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4412100315093994, "rewards/margins": 0.3578561544418335, "rewards/rejected": -0.7990661859512329, "step": 257 }, { "epoch": 0.3900226757369615, "epsilon_dpo/beta": 0.051734402775764465, "epsilon_dpo/beta_margin_grad_mean": -0.4043444097042084, "epsilon_dpo/beta_margin_grad_std": 0.14780445396900177, "epsilon_dpo/beta_margin_mean": 0.42923951148986816, "epsilon_dpo/beta_margin_std": 0.6817684769630432, "epsilon_dpo/loss_margin_mean": 8.37546443939209, "grad_norm": 12.723770141601562, "kl/avg_steps": 0.5, "kl/beta": 0.051989179104566574, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.840459976743023e-07, "logits/chosen": -0.3793967664241791, "logits/rejected": -0.5327560305595398, "logps/chosen": -90.00956726074219, "logps/ref_chosen": -82.86605834960938, "logps/ref_rejected": -104.7901611328125, "logps/rejected": -120.30911254882812, "loss": 1.108, "rewards/accuracies": 0.765625, "rewards/chosen": -0.37101608514785767, "rewards/margins": 0.42923951148986816, "rewards/rejected": -0.800255537033081, "step": 258 }, { "epoch": 0.3915343915343915, "epsilon_dpo/beta": 0.05148502439260483, "epsilon_dpo/beta_margin_grad_mean": -0.3661709129810333, "epsilon_dpo/beta_margin_grad_std": 0.1684502214193344, "epsilon_dpo/beta_margin_mean": 0.6204766035079956, "epsilon_dpo/beta_margin_std": 0.8014397025108337, "epsilon_dpo/loss_margin_mean": 12.153839111328125, "grad_norm": 13.267675399780273, "kl/avg_steps": 0.484375, "kl/beta": 0.051730524748563766, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.8292804142999796e-07, "logits/chosen": -0.2220759093761444, "logits/rejected": -0.213998943567276, "logps/chosen": -90.79042053222656, "logps/ref_chosen": -87.27658081054688, "logps/ref_rejected": -124.00669860839844, "logps/rejected": -139.67437744140625, "loss": 1.0011, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18326766788959503, "rewards/margins": 0.6204766035079956, "rewards/rejected": -0.8037442564964294, "step": 259 }, { "epoch": 0.3930461073318216, "epsilon_dpo/beta": 0.05129323527216911, "epsilon_dpo/beta_margin_grad_mean": -0.41950494050979614, "epsilon_dpo/beta_margin_grad_std": 0.18360477685928345, "epsilon_dpo/beta_margin_mean": 0.36882904171943665, "epsilon_dpo/beta_margin_std": 0.8418011665344238, "epsilon_dpo/loss_margin_mean": 7.304553508758545, "grad_norm": 17.972858428955078, "kl/avg_steps": 0.375, "kl/beta": 0.05148116126656532, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.818063669026256e-07, "logits/chosen": -0.2347155511379242, "logits/rejected": -0.4262591600418091, "logps/chosen": -75.84223937988281, "logps/ref_chosen": -69.40695190429688, "logps/ref_rejected": -104.71173858642578, "logps/rejected": -118.45158386230469, "loss": 1.2141, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3316202461719513, "rewards/margins": 0.36882901191711426, "rewards/rejected": -0.7004492282867432, "step": 260 }, { "epoch": 0.3945578231292517, "epsilon_dpo/beta": 0.05108557641506195, "epsilon_dpo/beta_margin_grad_mean": -0.43297892808914185, "epsilon_dpo/beta_margin_grad_std": 0.16156414151191711, "epsilon_dpo/beta_margin_mean": 0.3018457889556885, "epsilon_dpo/beta_margin_std": 0.7393701076507568, "epsilon_dpo/loss_margin_mean": 5.996203899383545, "grad_norm": 15.106315612792969, "kl/avg_steps": 0.40625, "kl/beta": 0.05128882825374603, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.806810054678331e-07, "logits/chosen": -0.23062291741371155, "logits/rejected": -0.158943310379982, "logps/chosen": -101.55908203125, "logps/ref_chosen": -96.14897918701172, "logps/ref_rejected": -89.86795043945312, "logps/rejected": -101.27426147460938, "loss": 1.2331, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27909642457962036, "rewards/margins": 0.3018457889556885, "rewards/rejected": -0.5809422135353088, "step": 261 }, { "epoch": 0.3960695389266818, "epsilon_dpo/beta": 0.050894845277071, "epsilon_dpo/beta_margin_grad_mean": -0.41639986634254456, "epsilon_dpo/beta_margin_grad_std": 0.14282996952533722, "epsilon_dpo/beta_margin_mean": 0.3772861957550049, "epsilon_dpo/beta_margin_std": 0.6464824080467224, "epsilon_dpo/loss_margin_mean": 7.49705696105957, "grad_norm": 13.404562950134277, "kl/avg_steps": 0.375, "kl/beta": 0.05108131095767021, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.7955198860439887e-07, "logits/chosen": -0.32994481921195984, "logits/rejected": -0.48363634943962097, "logps/chosen": -96.72879028320312, "logps/ref_chosen": -93.10110473632812, "logps/ref_rejected": -114.30091094970703, "logps/rejected": -125.42564392089844, "loss": 1.1403, "rewards/accuracies": 0.703125, "rewards/chosen": -0.18662017583847046, "rewards/margins": 0.3772861361503601, "rewards/rejected": -0.5639063119888306, "step": 262 }, { "epoch": 0.3975812547241119, "epsilon_dpo/beta": 0.050704702734947205, "epsilon_dpo/beta_margin_grad_mean": -0.40215593576431274, "epsilon_dpo/beta_margin_grad_std": 0.1577272117137909, "epsilon_dpo/beta_margin_mean": 0.4436055123806, "epsilon_dpo/beta_margin_std": 0.7195088863372803, "epsilon_dpo/loss_margin_mean": 8.844717025756836, "grad_norm": 12.462885856628418, "kl/avg_steps": 0.375, "kl/beta": 0.050890471786260605, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.784193478933516e-07, "logits/chosen": 0.021105842664837837, "logits/rejected": -0.2684558629989624, "logps/chosen": -80.36698913574219, "logps/ref_chosen": -78.04122924804688, "logps/ref_rejected": -112.9266357421875, "logps/rejected": -124.09712219238281, "loss": 1.1095, "rewards/accuracies": 0.671875, "rewards/chosen": -0.12003128975629807, "rewards/margins": 0.4436054825782776, "rewards/rejected": -0.5636367797851562, "step": 263 }, { "epoch": 0.39909297052154197, "epsilon_dpo/beta": 0.05043604597449303, "epsilon_dpo/beta_margin_grad_mean": -0.40311023592948914, "epsilon_dpo/beta_margin_grad_std": 0.1410197913646698, "epsilon_dpo/beta_margin_mean": 0.44368380308151245, "epsilon_dpo/beta_margin_std": 0.6663060784339905, "epsilon_dpo/loss_margin_mean": 8.859919548034668, "grad_norm": 13.311902046203613, "kl/avg_steps": 0.53125, "kl/beta": 0.0507003478705883, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.7728311501708674e-07, "logits/chosen": -0.061237186193466187, "logits/rejected": -0.34207794070243835, "logps/chosen": -115.7286376953125, "logps/ref_chosen": -110.40716552734375, "logps/ref_rejected": -123.06625366210938, "logps/rejected": -137.24765014648438, "loss": 1.0897, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2699325680732727, "rewards/margins": 0.44368380308151245, "rewards/rejected": -0.7136163711547852, "step": 264 }, { "epoch": 0.40060468631897206, "epsilon_dpo/beta": 0.05024832859635353, "epsilon_dpo/beta_margin_grad_mean": -0.37590134143829346, "epsilon_dpo/beta_margin_grad_std": 0.17293144762516022, "epsilon_dpo/beta_margin_mean": 0.577387273311615, "epsilon_dpo/beta_margin_std": 0.7995150685310364, "epsilon_dpo/loss_margin_mean": 11.604867935180664, "grad_norm": 11.446062088012695, "kl/avg_steps": 0.375, "kl/beta": 0.05043242499232292, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.7614332175848027e-07, "logits/chosen": -0.3174809217453003, "logits/rejected": -0.14214938879013062, "logps/chosen": -71.36831665039062, "logps/ref_chosen": -69.86323547363281, "logps/ref_rejected": -96.0023422241211, "logps/rejected": -109.11228942871094, "loss": 1.034, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07832078635692596, "rewards/margins": 0.577387273311615, "rewards/rejected": -0.6557080745697021, "step": 265 }, { "epoch": 0.4021164021164021, "epsilon_dpo/beta": 0.05004489794373512, "epsilon_dpo/beta_margin_grad_mean": -0.4076755940914154, "epsilon_dpo/beta_margin_grad_std": 0.14683128893375397, "epsilon_dpo/beta_margin_mean": 0.4213828146457672, "epsilon_dpo/beta_margin_std": 0.6990775465965271, "epsilon_dpo/loss_margin_mean": 8.499797821044922, "grad_norm": 14.631038665771484, "kl/avg_steps": 0.40625, "kl/beta": 0.050244010984897614, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.75e-07, "logits/chosen": -0.42971134185791016, "logits/rejected": -0.3967766761779785, "logps/chosen": -82.61585998535156, "logps/ref_chosen": -80.44645690917969, "logps/ref_rejected": -106.86441040039062, "logps/rejected": -117.53360748291016, "loss": 1.1174, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11023129522800446, "rewards/margins": 0.4213827848434448, "rewards/rejected": -0.5316140651702881, "step": 266 }, { "epoch": 0.4036281179138322, "epsilon_dpo/beta": 0.049842409789562225, "epsilon_dpo/beta_margin_grad_mean": -0.4204915165901184, "epsilon_dpo/beta_margin_grad_std": 0.15187732875347137, "epsilon_dpo/beta_margin_mean": 0.34613466262817383, "epsilon_dpo/beta_margin_std": 0.6853890419006348, "epsilon_dpo/loss_margin_mean": 7.032593250274658, "grad_norm": 11.797558784484863, "kl/avg_steps": 0.40625, "kl/beta": 0.05004071816802025, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.738531817228131e-07, "logits/chosen": -0.24386508762836456, "logits/rejected": -0.20387592911720276, "logps/chosen": -79.5445785522461, "logps/ref_chosen": -78.45423889160156, "logps/ref_rejected": -87.34706115722656, "logps/rejected": -95.46998596191406, "loss": 1.179, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0566682331264019, "rewards/margins": 0.34613466262817383, "rewards/rejected": -0.40280288457870483, "step": 267 }, { "epoch": 0.4051398337112623, "epsilon_dpo/beta": 0.04979650676250458, "epsilon_dpo/beta_margin_grad_mean": -0.44863420724868774, "epsilon_dpo/beta_margin_grad_std": 0.15428493916988373, "epsilon_dpo/beta_margin_mean": 0.24120254814624786, "epsilon_dpo/beta_margin_std": 0.7026641368865967, "epsilon_dpo/loss_margin_mean": 4.94456672668457, "grad_norm": 13.300025939941406, "kl/avg_steps": 0.09375, "kl/beta": 0.04983825236558914, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 3.7270289900589204e-07, "logits/chosen": -0.17837348580360413, "logits/rejected": -0.3197210431098938, "logps/chosen": -88.37873840332031, "logps/ref_chosen": -86.03016662597656, "logps/ref_rejected": -93.82392883300781, "logps/rejected": -101.1170654296875, "loss": 1.2736, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11823678016662598, "rewards/margins": 0.24120256304740906, "rewards/rejected": -0.35943934321403503, "step": 268 }, { "epoch": 0.40665154950869237, "epsilon_dpo/beta": 0.049594249576330185, "epsilon_dpo/beta_margin_grad_mean": -0.4113570749759674, "epsilon_dpo/beta_margin_grad_std": 0.15016913414001465, "epsilon_dpo/beta_margin_mean": 0.3949938714504242, "epsilon_dpo/beta_margin_std": 0.6843596696853638, "epsilon_dpo/loss_margin_mean": 8.056325912475586, "grad_norm": 13.268834114074707, "kl/avg_steps": 0.40625, "kl/beta": 0.049791570752859116, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.7154918402511714e-07, "logits/chosen": -0.5032898783683777, "logits/rejected": -0.3996407091617584, "logps/chosen": -92.66567993164062, "logps/ref_chosen": -89.21455383300781, "logps/ref_rejected": -108.29411315917969, "logps/rejected": -119.80156707763672, "loss": 1.1374, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17236913740634918, "rewards/margins": 0.3949939012527466, "rewards/rejected": -0.5673630237579346, "step": 269 }, { "epoch": 0.40816326530612246, "epsilon_dpo/beta": 0.049409087747335434, "epsilon_dpo/beta_margin_grad_mean": -0.41228729486465454, "epsilon_dpo/beta_margin_grad_std": 0.15867401659488678, "epsilon_dpo/beta_margin_mean": 0.39783310890197754, "epsilon_dpo/beta_margin_std": 0.7360899448394775, "epsilon_dpo/loss_margin_mean": 8.144404411315918, "grad_norm": 14.946235656738281, "kl/avg_steps": 0.375, "kl/beta": 0.049590110778808594, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.7039206905237656e-07, "logits/chosen": -0.25615566968917847, "logits/rejected": -0.4289062023162842, "logps/chosen": -92.39274597167969, "logps/ref_chosen": -90.55712890625, "logps/ref_rejected": -115.80068969726562, "logps/rejected": -125.78071594238281, "loss": 1.1507, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09273286163806915, "rewards/margins": 0.3978331387042999, "rewards/rejected": -0.4905660152435303, "step": 270 }, { "epoch": 0.40967498110355255, "epsilon_dpo/beta": 0.04927081987261772, "epsilon_dpo/beta_margin_grad_mean": -0.43657374382019043, "epsilon_dpo/beta_margin_grad_std": 0.1702788919210434, "epsilon_dpo/beta_margin_mean": 0.31171828508377075, "epsilon_dpo/beta_margin_std": 0.8074589371681213, "epsilon_dpo/loss_margin_mean": 6.436391830444336, "grad_norm": 14.009123802185059, "kl/avg_steps": 0.28125, "kl/beta": 0.049404844641685486, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.692315864546635e-07, "logits/chosen": -0.6964402198791504, "logits/rejected": -0.5896154046058655, "logps/chosen": -96.97105407714844, "logps/ref_chosen": -93.89132690429688, "logps/ref_rejected": -117.50479888916016, "logps/rejected": -127.02091979980469, "loss": 1.2445, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15434423089027405, "rewards/margins": 0.3117183446884155, "rewards/rejected": -0.4660625457763672, "step": 271 }, { "epoch": 0.41118669690098264, "epsilon_dpo/beta": 0.04907866567373276, "epsilon_dpo/beta_margin_grad_mean": -0.3888590931892395, "epsilon_dpo/beta_margin_grad_std": 0.1553007811307907, "epsilon_dpo/beta_margin_mean": 0.5047463178634644, "epsilon_dpo/beta_margin_std": 0.733122706413269, "epsilon_dpo/loss_margin_mean": 10.382523536682129, "grad_norm": 11.808626174926758, "kl/avg_steps": 0.390625, "kl/beta": 0.049266282469034195, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.6806776869317067e-07, "logits/chosen": -0.2755691111087799, "logits/rejected": -0.2921621799468994, "logps/chosen": -94.78904724121094, "logps/ref_chosen": -92.47999572753906, "logps/ref_rejected": -88.89102935791016, "logps/rejected": -101.58261108398438, "loss": 1.0642, "rewards/accuracies": 0.734375, "rewards/chosen": -0.11534780263900757, "rewards/margins": 0.5047463774681091, "rewards/rejected": -0.6200941801071167, "step": 272 }, { "epoch": 0.4126984126984127, "epsilon_dpo/beta": 0.04888010397553444, "epsilon_dpo/beta_margin_grad_mean": -0.40970584750175476, "epsilon_dpo/beta_margin_grad_std": 0.18216586112976074, "epsilon_dpo/beta_margin_mean": 0.4161832630634308, "epsilon_dpo/beta_margin_std": 0.8531363010406494, "epsilon_dpo/loss_margin_mean": 8.62950611114502, "grad_norm": 13.147982597351074, "kl/avg_steps": 0.40625, "kl/beta": 0.04907458275556564, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.669006483223828e-07, "logits/chosen": -0.46416640281677246, "logits/rejected": -0.3967297077178955, "logps/chosen": -85.8084716796875, "logps/ref_chosen": -81.18894958496094, "logps/ref_rejected": -108.72845458984375, "logps/rejected": -121.97748565673828, "loss": 1.1775, "rewards/accuracies": 0.734375, "rewards/chosen": -0.22839263081550598, "rewards/margins": 0.4161832332611084, "rewards/rejected": -0.644575834274292, "step": 273 }, { "epoch": 0.41421012849584277, "epsilon_dpo/beta": 0.048682332038879395, "epsilon_dpo/beta_margin_grad_mean": -0.3969515562057495, "epsilon_dpo/beta_margin_grad_std": 0.17456351220607758, "epsilon_dpo/beta_margin_mean": 0.48041605949401855, "epsilon_dpo/beta_margin_std": 0.8032354712486267, "epsilon_dpo/loss_margin_mean": 9.981647491455078, "grad_norm": 13.131810188293457, "kl/avg_steps": 0.40625, "kl/beta": 0.04887602478265762, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.657302579891656e-07, "logits/chosen": -0.1510474979877472, "logits/rejected": 0.027453720569610596, "logps/chosen": -89.0546875, "logps/ref_chosen": -85.10057067871094, "logps/ref_rejected": -93.13152313232422, "logps/rejected": -107.06729125976562, "loss": 1.1088, "rewards/accuracies": 0.703125, "rewards/chosen": -0.19507646560668945, "rewards/margins": 0.4804159998893738, "rewards/rejected": -0.675492525100708, "step": 274 }, { "epoch": 0.41572184429327286, "epsilon_dpo/beta": 0.04848536476492882, "epsilon_dpo/beta_margin_grad_mean": -0.39388585090637207, "epsilon_dpo/beta_margin_grad_std": 0.16643884778022766, "epsilon_dpo/beta_margin_mean": 0.4981761872768402, "epsilon_dpo/beta_margin_std": 0.779217004776001, "epsilon_dpo/loss_margin_mean": 10.376490592956543, "grad_norm": 13.791189193725586, "kl/avg_steps": 0.40625, "kl/beta": 0.04867827147245407, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.645566304318526e-07, "logits/chosen": -0.30934983491897583, "logits/rejected": -0.3148757815361023, "logps/chosen": -79.164794921875, "logps/ref_chosen": -76.37564849853516, "logps/ref_rejected": -106.13626098632812, "logps/rejected": -119.30189514160156, "loss": 1.0848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13688188791275024, "rewards/margins": 0.4981761574745178, "rewards/rejected": -0.6350580453872681, "step": 275 }, { "epoch": 0.41723356009070295, "epsilon_dpo/beta": 0.04830434173345566, "epsilon_dpo/beta_margin_grad_mean": -0.3967646062374115, "epsilon_dpo/beta_margin_grad_std": 0.16201792657375336, "epsilon_dpo/beta_margin_mean": 0.4819945991039276, "epsilon_dpo/beta_margin_std": 0.759510338306427, "epsilon_dpo/loss_margin_mean": 10.081770896911621, "grad_norm": 14.872968673706055, "kl/avg_steps": 0.375, "kl/beta": 0.04848131537437439, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.633797984793294e-07, "logits/chosen": -0.2909401059150696, "logits/rejected": -0.37871038913726807, "logps/chosen": -77.63218688964844, "logps/ref_chosen": -75.13729858398438, "logps/ref_rejected": -84.94955444335938, "logps/rejected": -97.52620697021484, "loss": 1.0904, "rewards/accuracies": 0.734375, "rewards/chosen": -0.12274863570928574, "rewards/margins": 0.48199462890625, "rewards/rejected": -0.604743242263794, "step": 276 }, { "epoch": 0.41874527588813304, "epsilon_dpo/beta": 0.04825973138213158, "epsilon_dpo/beta_margin_grad_mean": -0.45563802123069763, "epsilon_dpo/beta_margin_grad_std": 0.16357764601707458, "epsilon_dpo/beta_margin_mean": 0.20618519186973572, "epsilon_dpo/beta_margin_std": 0.7475130558013916, "epsilon_dpo/loss_margin_mean": 4.387637138366699, "grad_norm": 15.38239574432373, "kl/avg_steps": 0.09375, "kl/beta": 0.04830018803477287, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 3.6219979505011555e-07, "logits/chosen": -0.44621503353118896, "logits/rejected": -0.3698129653930664, "logps/chosen": -106.39254760742188, "logps/ref_chosen": -100.04359436035156, "logps/ref_rejected": -93.42616271972656, "logps/rejected": -104.16275024414062, "loss": 1.3199, "rewards/accuracies": 0.609375, "rewards/chosen": -0.3093889653682709, "rewards/margins": 0.2061852216720581, "rewards/rejected": -0.5155741572380066, "step": 277 }, { "epoch": 0.42025699168556313, "epsilon_dpo/beta": 0.04813912510871887, "epsilon_dpo/beta_margin_grad_mean": -0.4347180724143982, "epsilon_dpo/beta_margin_grad_std": 0.16170021891593933, "epsilon_dpo/beta_margin_mean": 0.2943073511123657, "epsilon_dpo/beta_margin_std": 0.7458164691925049, "epsilon_dpo/loss_margin_mean": 6.219837665557861, "grad_norm": 20.242382049560547, "kl/avg_steps": 0.25, "kl/beta": 0.04825494810938835, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.6101665315144353e-07, "logits/chosen": -0.3930334448814392, "logits/rejected": -0.5034379363059998, "logps/chosen": -99.878173828125, "logps/ref_chosen": -93.06146240234375, "logps/ref_rejected": -109.71893310546875, "logps/rejected": -122.75547790527344, "loss": 1.2412, "rewards/accuracies": 0.671875, "rewards/chosen": -0.33152052760124207, "rewards/margins": 0.2943073511123657, "rewards/rejected": -0.6258278489112854, "step": 278 }, { "epoch": 0.4217687074829932, "epsilon_dpo/beta": 0.047846004366874695, "epsilon_dpo/beta_margin_grad_mean": -0.36673232913017273, "epsilon_dpo/beta_margin_grad_std": 0.15627476572990417, "epsilon_dpo/beta_margin_mean": 0.6086516976356506, "epsilon_dpo/beta_margin_std": 0.7403861880302429, "epsilon_dpo/loss_margin_mean": 12.797127723693848, "grad_norm": 15.089401245117188, "kl/avg_steps": 0.609375, "kl/beta": 0.048134613782167435, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.5983040587833563e-07, "logits/chosen": -0.570505678653717, "logits/rejected": -0.41417115926742554, "logps/chosen": -75.94297790527344, "logps/ref_chosen": -76.2708740234375, "logps/ref_rejected": -91.68028259277344, "logps/rejected": -104.14952087402344, "loss": 0.9893, "rewards/accuracies": 0.828125, "rewards/chosen": 0.014286134392023087, "rewards/margins": 0.6086516976356506, "rewards/rejected": -0.5943655371665955, "step": 279 }, { "epoch": 0.42328042328042326, "epsilon_dpo/beta": 0.0475488044321537, "epsilon_dpo/beta_margin_grad_mean": -0.3657313883304596, "epsilon_dpo/beta_margin_grad_std": 0.15367664396762848, "epsilon_dpo/beta_margin_mean": 0.6205738186836243, "epsilon_dpo/beta_margin_std": 0.7367238998413086, "epsilon_dpo/loss_margin_mean": 13.125807762145996, "grad_norm": 11.988062858581543, "kl/avg_steps": 0.625, "kl/beta": 0.04784306883811951, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.586410864126781e-07, "logits/chosen": -0.3317842483520508, "logits/rejected": -0.27062520384788513, "logps/chosen": -84.26651000976562, "logps/ref_chosen": -81.00099182128906, "logps/ref_rejected": -101.10507202148438, "logps/rejected": -117.49641418457031, "loss": 0.9782, "rewards/accuracies": 0.875, "rewards/chosen": -0.15623867511749268, "rewards/margins": 0.6205738186836243, "rewards/rejected": -0.7768125534057617, "step": 280 }, { "epoch": 0.42479213907785335, "epsilon_dpo/beta": 0.04734262824058533, "epsilon_dpo/beta_margin_grad_mean": -0.39394766092300415, "epsilon_dpo/beta_margin_grad_std": 0.1553352326154709, "epsilon_dpo/beta_margin_mean": 0.48012128472328186, "epsilon_dpo/beta_margin_std": 0.7120420336723328, "epsilon_dpo/loss_margin_mean": 10.240545272827148, "grad_norm": 10.852984428405762, "kl/avg_steps": 0.4375, "kl/beta": 0.0475459061563015, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.574487280222929e-07, "logits/chosen": -0.39467182755470276, "logits/rejected": -0.2560708224773407, "logps/chosen": -85.82388305664062, "logps/ref_chosen": -82.56924438476562, "logps/ref_rejected": -83.29329681396484, "logps/rejected": -96.78848266601562, "loss": 1.0782, "rewards/accuracies": 0.734375, "rewards/chosen": -0.15683114528656006, "rewards/margins": 0.48012131452560425, "rewards/rejected": -0.6369524598121643, "step": 281 }, { "epoch": 0.42630385487528344, "epsilon_dpo/beta": 0.04717332124710083, "epsilon_dpo/beta_margin_grad_mean": -0.3915327191352844, "epsilon_dpo/beta_margin_grad_std": 0.17149022221565247, "epsilon_dpo/beta_margin_mean": 0.5186943411827087, "epsilon_dpo/beta_margin_std": 0.8207116723060608, "epsilon_dpo/loss_margin_mean": 11.112959861755371, "grad_norm": 12.8870210647583, "kl/avg_steps": 0.359375, "kl/beta": 0.04733880236744881, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.562533640600075e-07, "logits/chosen": -0.5010318756103516, "logits/rejected": -0.29100731015205383, "logps/chosen": -89.39044189453125, "logps/ref_chosen": -84.25967407226562, "logps/ref_rejected": -91.84246826171875, "logps/rejected": -108.08619689941406, "loss": 1.0816, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24529683589935303, "rewards/margins": 0.5186943411827087, "rewards/rejected": -0.7639911770820618, "step": 282 }, { "epoch": 0.42781557067271353, "epsilon_dpo/beta": 0.04699710011482239, "epsilon_dpo/beta_margin_grad_mean": -0.4094040095806122, "epsilon_dpo/beta_margin_grad_std": 0.1825781762599945, "epsilon_dpo/beta_margin_mean": 0.4076806604862213, "epsilon_dpo/beta_margin_std": 0.8461658954620361, "epsilon_dpo/loss_margin_mean": 8.805419921875, "grad_norm": 13.878371238708496, "kl/avg_steps": 0.375, "kl/beta": 0.047169286757707596, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.550550279627215e-07, "logits/chosen": -0.17757020890712738, "logits/rejected": -0.5387111306190491, "logps/chosen": -96.1683120727539, "logps/ref_chosen": -87.76092529296875, "logps/ref_rejected": -123.7965087890625, "logps/rejected": -141.00930786132812, "loss": 1.1833, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3980576992034912, "rewards/margins": 0.4076806604862213, "rewards/rejected": -0.8057383894920349, "step": 283 }, { "epoch": 0.4293272864701436, "epsilon_dpo/beta": 0.04679214581847191, "epsilon_dpo/beta_margin_grad_mean": -0.39326760172843933, "epsilon_dpo/beta_margin_grad_std": 0.15346704423427582, "epsilon_dpo/beta_margin_mean": 0.4926775395870209, "epsilon_dpo/beta_margin_std": 0.7348820567131042, "epsilon_dpo/loss_margin_mean": 10.620450973510742, "grad_norm": 12.785466194152832, "kl/avg_steps": 0.4375, "kl/beta": 0.046993061900138855, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -0.2667410373687744, "logits/rejected": -0.15663591027259827, "logps/chosen": -87.38343811035156, "logps/ref_chosen": -81.22230529785156, "logps/ref_rejected": -111.91586303710938, "logps/rejected": -128.69744873046875, "loss": 1.0723, "rewards/accuracies": 0.78125, "rewards/chosen": -0.28914564847946167, "rewards/margins": 0.4926775097846985, "rewards/rejected": -0.7818231582641602, "step": 284 }, { "epoch": 0.4308390022675737, "epsilon_dpo/beta": 0.046661436557769775, "epsilon_dpo/beta_margin_grad_mean": -0.43417391180992126, "epsilon_dpo/beta_margin_grad_std": 0.17843972146511078, "epsilon_dpo/beta_margin_mean": 0.311433881521225, "epsilon_dpo/beta_margin_std": 0.8369320034980774, "epsilon_dpo/loss_margin_mean": 6.8021392822265625, "grad_norm": 13.3861722946167, "kl/avg_steps": 0.28125, "kl/beta": 0.04678836464881897, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.5264957352549375e-07, "logits/chosen": -0.33967140316963196, "logits/rejected": -0.2970946431159973, "logps/chosen": -104.98358154296875, "logps/ref_chosen": -92.5496597290039, "logps/ref_rejected": -102.47920227050781, "logps/rejected": -121.71526336669922, "loss": 1.2579, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5837230086326599, "rewards/margins": 0.31143394112586975, "rewards/rejected": -0.895156979560852, "step": 285 }, { "epoch": 0.4323507180650038, "epsilon_dpo/beta": 0.04642849788069725, "epsilon_dpo/beta_margin_grad_mean": -0.3862251043319702, "epsilon_dpo/beta_margin_grad_std": 0.1750505268573761, "epsilon_dpo/beta_margin_mean": 0.532291054725647, "epsilon_dpo/beta_margin_std": 0.8362828493118286, "epsilon_dpo/loss_margin_mean": 11.574300765991211, "grad_norm": 12.411300659179688, "kl/avg_steps": 0.5, "kl/beta": 0.04665714129805565, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.514425224712835e-07, "logits/chosen": -0.36736541986465454, "logits/rejected": -0.30413326621055603, "logps/chosen": -93.0548324584961, "logps/ref_chosen": -83.22084045410156, "logps/ref_rejected": -118.17338562011719, "logps/rejected": -139.58168029785156, "loss": 1.0783, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4587008059024811, "rewards/margins": 0.532291054725647, "rewards/rejected": -0.9909918308258057, "step": 286 }, { "epoch": 0.43386243386243384, "epsilon_dpo/beta": 0.04615398123860359, "epsilon_dpo/beta_margin_grad_mean": -0.3491361737251282, "epsilon_dpo/beta_margin_grad_std": 0.1557650864124298, "epsilon_dpo/beta_margin_mean": 0.7086184024810791, "epsilon_dpo/beta_margin_std": 0.7686328291893005, "epsilon_dpo/loss_margin_mean": 15.439092636108398, "grad_norm": 11.449210166931152, "kl/avg_steps": 0.59375, "kl/beta": 0.04642501473426819, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.502326338516534e-07, "logits/chosen": -0.6256189942359924, "logits/rejected": -0.15066683292388916, "logps/chosen": -73.1468734741211, "logps/ref_chosen": -67.09947204589844, "logps/ref_rejected": -83.48188781738281, "logps/rejected": -104.9683837890625, "loss": 0.9251, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2815502882003784, "rewards/margins": 0.7086184024810791, "rewards/rejected": -0.9901686906814575, "step": 287 }, { "epoch": 0.43537414965986393, "epsilon_dpo/beta": 0.04596810042858124, "epsilon_dpo/beta_margin_grad_mean": -0.4099636971950531, "epsilon_dpo/beta_margin_grad_std": 0.15979404747486115, "epsilon_dpo/beta_margin_mean": 0.41960883140563965, "epsilon_dpo/beta_margin_std": 0.7552804946899414, "epsilon_dpo/loss_margin_mean": 9.233772277832031, "grad_norm": 15.480899810791016, "kl/avg_steps": 0.40625, "kl/beta": 0.046150993555784225, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.490199415097892e-07, "logits/chosen": -0.570479154586792, "logits/rejected": -0.406840443611145, "logps/chosen": -101.13687133789062, "logps/ref_chosen": -89.84213256835938, "logps/ref_rejected": -113.61448669433594, "logps/rejected": -134.14300537109375, "loss": 1.1375, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5225342512130737, "rewards/margins": 0.41960883140563965, "rewards/rejected": -0.9421430826187134, "step": 288 }, { "epoch": 0.436885865457294, "epsilon_dpo/beta": 0.045825205743312836, "epsilon_dpo/beta_margin_grad_mean": -0.3875085413455963, "epsilon_dpo/beta_margin_grad_std": 0.15026584267616272, "epsilon_dpo/beta_margin_mean": 0.5227700471878052, "epsilon_dpo/beta_margin_std": 0.7179827094078064, "epsilon_dpo/loss_margin_mean": 11.50743293762207, "grad_norm": 10.489204406738281, "kl/avg_steps": 0.3125, "kl/beta": 0.04596426337957382, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.4780447936730247e-07, "logits/chosen": -0.22212985157966614, "logits/rejected": -0.39587292075157166, "logps/chosen": -88.09721374511719, "logps/ref_chosen": -78.14584350585938, "logps/ref_rejected": -93.63999938964844, "logps/rejected": -115.09880065917969, "loss": 1.0435, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4589444398880005, "rewards/margins": 0.5227700471878052, "rewards/rejected": -0.9817144870758057, "step": 289 }, { "epoch": 0.4383975812547241, "epsilon_dpo/beta": 0.04562516510486603, "epsilon_dpo/beta_margin_grad_mean": -0.4058082401752472, "epsilon_dpo/beta_margin_grad_std": 0.15972179174423218, "epsilon_dpo/beta_margin_mean": 0.4322429895401001, "epsilon_dpo/beta_margin_std": 0.7467278242111206, "epsilon_dpo/loss_margin_mean": 9.5772705078125, "grad_norm": 12.687112808227539, "kl/avg_steps": 0.4375, "kl/beta": 0.04582107067108154, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.465862814232821e-07, "logits/chosen": -0.34867942333221436, "logits/rejected": -0.12253440916538239, "logps/chosen": -97.43099975585938, "logps/ref_chosen": -83.81318664550781, "logps/ref_rejected": -114.35005187988281, "logps/rejected": -137.54513549804688, "loss": 1.1257, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6226211190223694, "rewards/margins": 0.4322430491447449, "rewards/rejected": -1.0548641681671143, "step": 290 }, { "epoch": 0.4399092970521542, "epsilon_dpo/beta": 0.04536939039826393, "epsilon_dpo/beta_margin_grad_mean": -0.38911837339401245, "epsilon_dpo/beta_margin_grad_std": 0.17643475532531738, "epsilon_dpo/beta_margin_mean": 0.5134809613227844, "epsilon_dpo/beta_margin_std": 0.8421981334686279, "epsilon_dpo/loss_margin_mean": 11.423192977905273, "grad_norm": 12.824395179748535, "kl/avg_steps": 0.5625, "kl/beta": 0.04562147706747055, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.4536538175334343e-07, "logits/chosen": -0.41977792978286743, "logits/rejected": -0.8858845233917236, "logps/chosen": -83.10356140136719, "logps/ref_chosen": -71.1761703491211, "logps/ref_rejected": -101.21935272216797, "logps/rejected": -124.5699462890625, "loss": 1.0952, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5435177087783813, "rewards/margins": 0.5134809613227844, "rewards/rejected": -1.0569987297058105, "step": 291 }, { "epoch": 0.4414210128495843, "epsilon_dpo/beta": 0.045158155262470245, "epsilon_dpo/beta_margin_grad_mean": -0.40868300199508667, "epsilon_dpo/beta_margin_grad_std": 0.17304831743240356, "epsilon_dpo/beta_margin_mean": 0.41493427753448486, "epsilon_dpo/beta_margin_std": 0.8037520051002502, "epsilon_dpo/loss_margin_mean": 9.300040245056152, "grad_norm": 16.256256103515625, "kl/avg_steps": 0.46875, "kl/beta": 0.04536629468202591, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.4414181450867465e-07, "logits/chosen": -0.3706292510032654, "logits/rejected": -0.3315098285675049, "logps/chosen": -100.09800720214844, "logps/ref_chosen": -87.9404296875, "logps/ref_rejected": -103.1168212890625, "logps/rejected": -124.57444763183594, "loss": 1.1609, "rewards/accuracies": 0.75, "rewards/chosen": -0.5523610711097717, "rewards/margins": 0.4149342477321625, "rewards/rejected": -0.9672952890396118, "step": 292 }, { "epoch": 0.4429327286470144, "epsilon_dpo/beta": 0.04493334889411926, "epsilon_dpo/beta_margin_grad_mean": -0.3718346357345581, "epsilon_dpo/beta_margin_grad_std": 0.19872498512268066, "epsilon_dpo/beta_margin_mean": 0.6251580119132996, "epsilon_dpo/beta_margin_std": 0.9845532178878784, "epsilon_dpo/loss_margin_mean": 14.047744750976562, "grad_norm": 14.59776782989502, "kl/avg_steps": 0.5, "kl/beta": 0.0451546311378479, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.4291561391508185e-07, "logits/chosen": -0.380871057510376, "logits/rejected": -0.24333512783050537, "logps/chosen": -88.27153015136719, "logps/ref_chosen": -77.02020263671875, "logps/ref_rejected": -111.15068054199219, "logps/rejected": -136.44973754882812, "loss": 1.0647, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5092015266418457, "rewards/margins": 0.6251579523086548, "rewards/rejected": -1.1343594789505005, "step": 293 }, { "epoch": 0.4444444444444444, "epsilon_dpo/beta": 0.044737886637449265, "epsilon_dpo/beta_margin_grad_mean": -0.41345012187957764, "epsilon_dpo/beta_margin_grad_std": 0.16614548861980438, "epsilon_dpo/beta_margin_mean": 0.39459964632987976, "epsilon_dpo/beta_margin_std": 0.7768252491950989, "epsilon_dpo/loss_margin_mean": 8.932480812072754, "grad_norm": 14.383130073547363, "kl/avg_steps": 0.4375, "kl/beta": 0.04492998123168945, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.4168681427203153e-07, "logits/chosen": -0.4731680750846863, "logits/rejected": -0.32575279474258423, "logps/chosen": -89.42033386230469, "logps/ref_chosen": -76.86295318603516, "logps/ref_rejected": -91.43938446044922, "logps/rejected": -112.92924499511719, "loss": 1.1666, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5631007552146912, "rewards/margins": 0.3945996165275574, "rewards/rejected": -0.9577003717422485, "step": 294 }, { "epoch": 0.4459561602418745, "epsilon_dpo/beta": 0.044529028236866, "epsilon_dpo/beta_margin_grad_mean": -0.4238334894180298, "epsilon_dpo/beta_margin_grad_std": 0.1660003811120987, "epsilon_dpo/beta_margin_mean": 0.3427713215351105, "epsilon_dpo/beta_margin_std": 0.7789937257766724, "epsilon_dpo/loss_margin_mean": 7.807656288146973, "grad_norm": 19.5250186920166, "kl/avg_steps": 0.46875, "kl/beta": 0.04473426938056946, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.4045544995169125e-07, "logits/chosen": -0.4011607766151428, "logits/rejected": -0.5367324352264404, "logps/chosen": -83.89892578125, "logps/ref_chosen": -70.6540298461914, "logps/ref_rejected": -107.9378433227539, "logps/rejected": -128.99038696289062, "loss": 1.2102, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5908478498458862, "rewards/margins": 0.3427712917327881, "rewards/rejected": -0.9336191415786743, "step": 295 }, { "epoch": 0.4474678760393046, "epsilon_dpo/beta": 0.04436302185058594, "epsilon_dpo/beta_margin_grad_mean": -0.38691622018814087, "epsilon_dpo/beta_margin_grad_std": 0.17068080604076385, "epsilon_dpo/beta_margin_mean": 0.5330710411071777, "epsilon_dpo/beta_margin_std": 0.8052643537521362, "epsilon_dpo/loss_margin_mean": 12.143484115600586, "grad_norm": 13.374835968017578, "kl/avg_steps": 0.375, "kl/beta": 0.04452555626630783, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.392215553979679e-07, "logits/chosen": -0.18664756417274475, "logits/rejected": -0.5290898084640503, "logps/chosen": -105.62068176269531, "logps/ref_chosen": -92.49901580810547, "logps/ref_rejected": -109.3675537109375, "logps/rejected": -134.63270568847656, "loss": 1.0668, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5853240489959717, "rewards/margins": 0.5330710411071777, "rewards/rejected": -1.1183950901031494, "step": 296 }, { "epoch": 0.4489795918367347, "epsilon_dpo/beta": 0.04411409795284271, "epsilon_dpo/beta_margin_grad_mean": -0.37734997272491455, "epsilon_dpo/beta_margin_grad_std": 0.15403449535369873, "epsilon_dpo/beta_margin_mean": 0.5672126412391663, "epsilon_dpo/beta_margin_std": 0.7445960640907288, "epsilon_dpo/loss_margin_mean": 12.947671890258789, "grad_norm": 13.148897171020508, "kl/avg_steps": 0.5625, "kl/beta": 0.04435920715332031, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.3798516512554485e-07, "logits/chosen": -0.3896023631095886, "logits/rejected": -0.3738701343536377, "logps/chosen": -91.46431732177734, "logps/ref_chosen": -78.97463989257812, "logps/ref_rejected": -96.7846908569336, "logps/rejected": -122.2220458984375, "loss": 1.0188, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5526552796363831, "rewards/margins": 0.5672126412391663, "rewards/rejected": -1.1198678016662598, "step": 297 }, { "epoch": 0.4504913076341648, "epsilon_dpo/beta": 0.043908704072237015, "epsilon_dpo/beta_margin_grad_mean": -0.4079945385456085, "epsilon_dpo/beta_margin_grad_std": 0.1935083121061325, "epsilon_dpo/beta_margin_mean": 0.4454768896102905, "epsilon_dpo/beta_margin_std": 0.9295777678489685, "epsilon_dpo/loss_margin_mean": 10.276945114135742, "grad_norm": 20.633535385131836, "kl/avg_steps": 0.46875, "kl/beta": 0.04411108419299126, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.367463137189156e-07, "logits/chosen": -0.49798786640167236, "logits/rejected": -0.21161851286888123, "logps/chosen": -90.81427001953125, "logps/ref_chosen": -78.96235656738281, "logps/ref_rejected": -94.07938385009766, "logps/rejected": -116.20824432373047, "loss": 1.1815, "rewards/accuracies": 0.75, "rewards/chosen": -0.5214691162109375, "rewards/margins": 0.4454768896102905, "rewards/rejected": -0.966946005821228, "step": 298 }, { "epoch": 0.4520030234315949, "epsilon_dpo/beta": 0.043745007365942, "epsilon_dpo/beta_margin_grad_mean": -0.4169948995113373, "epsilon_dpo/beta_margin_grad_std": 0.1756003201007843, "epsilon_dpo/beta_margin_mean": 0.4042234420776367, "epsilon_dpo/beta_margin_std": 0.862417459487915, "epsilon_dpo/loss_margin_mean": 9.364120483398438, "grad_norm": 14.022722244262695, "kl/avg_steps": 0.375, "kl/beta": 0.04390527680516243, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.2293621301651001, "logits/rejected": -0.3756565749645233, "logps/chosen": -94.15196228027344, "logps/ref_chosen": -81.70744323730469, "logps/ref_rejected": -91.36782836914062, "logps/rejected": -113.17647552490234, "loss": 1.1844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5472281575202942, "rewards/margins": 0.4042234718799591, "rewards/rejected": -0.9514516592025757, "step": 299 }, { "epoch": 0.45351473922902497, "epsilon_dpo/beta": 0.043595246970653534, "epsilon_dpo/beta_margin_grad_mean": -0.4183122515678406, "epsilon_dpo/beta_margin_grad_std": 0.19845078885555267, "epsilon_dpo/beta_margin_mean": 0.3808678686618805, "epsilon_dpo/beta_margin_std": 0.9299583435058594, "epsilon_dpo/loss_margin_mean": 8.885825157165527, "grad_norm": 13.588310241699219, "kl/avg_steps": 0.34375, "kl/beta": 0.04374124854803085, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.3426136618426043e-07, "logits/chosen": -0.32883864641189575, "logits/rejected": -0.4738979935646057, "logps/chosen": -95.8285140991211, "logps/ref_chosen": -84.14907836914062, "logps/ref_rejected": -103.7045669555664, "logps/rejected": -124.26982116699219, "loss": 1.2376, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5121716260910034, "rewards/margins": 0.3808678984642029, "rewards/rejected": -0.8930395245552063, "step": 300 }, { "epoch": 0.45351473922902497, "eval_epsilon_dpo/beta": 0.04342613369226456, "eval_epsilon_dpo/beta_margin_grad_mean": -0.40209266543388367, "eval_epsilon_dpo/beta_margin_grad_std": 0.17587903141975403, "eval_epsilon_dpo/beta_margin_mean": 0.46374091506004333, "eval_epsilon_dpo/beta_margin_std": 0.8474838137626648, "eval_epsilon_dpo/loss_margin_mean": 10.808184623718262, "eval_kl/n_epsilon_steps": 0.3050176203250885, "eval_kl/p_epsilon_steps": 0.6941021084785461, "eval_logits/chosen": -0.3509601354598999, "eval_logits/rejected": -0.42714956402778625, "eval_logps/chosen": -97.03380584716797, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -124.65033721923828, "eval_loss": 0.5699101686477661, "eval_rewards/accuracies": 0.7086267471313477, "eval_rewards/chosen": -0.41977250576019287, "eval_rewards/margins": 0.46374091506004333, "eval_rewards/rejected": -0.8835135102272034, "eval_runtime": 47.568, "eval_samples_per_second": 48.415, "eval_steps_per_second": 1.514, "step": 300 }, { "epoch": 0.455026455026455, "epsilon_dpo/beta": 0.043500397354364395, "epsilon_dpo/beta_margin_grad_mean": -0.4132575988769531, "epsilon_dpo/beta_margin_grad_std": 0.17967145144939423, "epsilon_dpo/beta_margin_mean": 0.42292046546936035, "epsilon_dpo/beta_margin_std": 0.8556511402130127, "epsilon_dpo/loss_margin_mean": 9.865974426269531, "grad_norm": 12.259976387023926, "kl/avg_steps": 0.21875, "kl/beta": 0.04359140247106552, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 3.3301533956555885e-07, "logits/chosen": -0.6459434628486633, "logits/rejected": -0.5689473748207092, "logps/chosen": -84.82774353027344, "logps/ref_chosen": -75.17066955566406, "logps/ref_rejected": -91.63156127929688, "logps/rejected": -111.15460968017578, "loss": 1.17, "rewards/accuracies": 0.640625, "rewards/chosen": -0.42266562581062317, "rewards/margins": 0.42292046546936035, "rewards/rejected": -0.8455861210823059, "step": 301 }, { "epoch": 0.4565381708238851, "epsilon_dpo/beta": 0.043323881924152374, "epsilon_dpo/beta_margin_grad_mean": -0.4404016435146332, "epsilon_dpo/beta_margin_grad_std": 0.1699916273355484, "epsilon_dpo/beta_margin_mean": 0.26643675565719604, "epsilon_dpo/beta_margin_std": 0.8003353476524353, "epsilon_dpo/loss_margin_mean": 6.27188777923584, "grad_norm": 15.321131706237793, "kl/avg_steps": 0.40625, "kl/beta": 0.043496254831552505, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.317669908293554e-07, "logits/chosen": -0.4655022919178009, "logits/rejected": -0.3340109884738922, "logps/chosen": -102.78459930419922, "logps/ref_chosen": -91.02297973632812, "logps/ref_rejected": -114.12149810791016, "logps/rejected": -132.15499877929688, "loss": 1.2832, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5123536586761475, "rewards/margins": 0.26643672585487366, "rewards/rejected": -0.7787904143333435, "step": 302 }, { "epoch": 0.4580498866213152, "epsilon_dpo/beta": 0.04314858838915825, "epsilon_dpo/beta_margin_grad_mean": -0.37916189432144165, "epsilon_dpo/beta_margin_grad_std": 0.19821646809577942, "epsilon_dpo/beta_margin_mean": 0.5926302671432495, "epsilon_dpo/beta_margin_std": 0.9654866456985474, "epsilon_dpo/loss_margin_mean": 13.885607719421387, "grad_norm": 12.133411407470703, "kl/avg_steps": 0.40625, "kl/beta": 0.04332026466727257, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.3051635489464793e-07, "logits/chosen": -0.25000911951065063, "logits/rejected": -0.08165319263935089, "logps/chosen": -98.95149993896484, "logps/ref_chosen": -89.47880554199219, "logps/ref_rejected": -114.87516784667969, "logps/rejected": -138.2334747314453, "loss": 1.0818, "rewards/accuracies": 0.703125, "rewards/chosen": -0.41259628534317017, "rewards/margins": 0.5926302671432495, "rewards/rejected": -1.0052266120910645, "step": 303 }, { "epoch": 0.4595616024187453, "epsilon_dpo/beta": 0.042906589806079865, "epsilon_dpo/beta_margin_grad_mean": -0.39030832052230835, "epsilon_dpo/beta_margin_grad_std": 0.14020994305610657, "epsilon_dpo/beta_margin_mean": 0.5041561722755432, "epsilon_dpo/beta_margin_std": 0.6786003708839417, "epsilon_dpo/loss_margin_mean": 11.824028015136719, "grad_norm": 11.328267097473145, "kl/avg_steps": 0.5625, "kl/beta": 0.043144989758729935, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.292634667444117e-07, "logits/chosen": -0.48182913661003113, "logits/rejected": -0.5360888242721558, "logps/chosen": -89.85015869140625, "logps/ref_chosen": -82.58224487304688, "logps/ref_rejected": -101.22938537597656, "logps/rejected": -120.32133483886719, "loss": 1.045, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3128387928009033, "rewards/margins": 0.5041561722755432, "rewards/rejected": -0.8169949054718018, "step": 304 }, { "epoch": 0.46107331821617537, "epsilon_dpo/beta": 0.0427604503929615, "epsilon_dpo/beta_margin_grad_mean": -0.3991040587425232, "epsilon_dpo/beta_margin_grad_std": 0.19379828870296478, "epsilon_dpo/beta_margin_mean": 0.48293551802635193, "epsilon_dpo/beta_margin_std": 0.9312344193458557, "epsilon_dpo/loss_margin_mean": 11.445974349975586, "grad_norm": 12.086163520812988, "kl/avg_steps": 0.34375, "kl/beta": 0.042903654277324677, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.280083614246217e-07, "logits/chosen": -0.22488896548748016, "logits/rejected": -0.0361773818731308, "logps/chosen": -107.65516662597656, "logps/ref_chosen": -97.63542175292969, "logps/ref_rejected": -90.60706329345703, "logps/rejected": -112.07278442382812, "loss": 1.1529, "rewards/accuracies": 0.671875, "rewards/chosen": -0.43161648511886597, "rewards/margins": 0.48293548822402954, "rewards/rejected": -0.9145519733428955, "step": 305 }, { "epoch": 0.46258503401360546, "epsilon_dpo/beta": 0.0426006019115448, "epsilon_dpo/beta_margin_grad_mean": -0.39660578966140747, "epsilon_dpo/beta_margin_grad_std": 0.1713494211435318, "epsilon_dpo/beta_margin_mean": 0.49307981133461, "epsilon_dpo/beta_margin_std": 0.8368701934814453, "epsilon_dpo/loss_margin_mean": 11.6995210647583, "grad_norm": 12.074493408203125, "kl/avg_steps": 0.375, "kl/beta": 0.04275668039917946, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.267510740432719e-07, "logits/chosen": -0.208884596824646, "logits/rejected": -0.4091324210166931, "logps/chosen": -82.65934753417969, "logps/ref_chosen": -76.59986877441406, "logps/ref_rejected": -108.35289764404297, "logps/rejected": -126.11189270019531, "loss": 1.1049, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2597709894180298, "rewards/margins": 0.4930797517299652, "rewards/rejected": -0.7528507709503174, "step": 306 }, { "epoch": 0.46409674981103555, "epsilon_dpo/beta": 0.042601197957992554, "epsilon_dpo/beta_margin_grad_mean": -0.44173377752304077, "epsilon_dpo/beta_margin_grad_std": 0.1646379977464676, "epsilon_dpo/beta_margin_mean": 0.2777702510356903, "epsilon_dpo/beta_margin_std": 0.7612587213516235, "epsilon_dpo/loss_margin_mean": 6.658515930175781, "grad_norm": 13.114051818847656, "kl/avg_steps": 0.0, "kl/beta": 0.04259693995118141, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 3.2549163976939285e-07, "logits/chosen": -0.3018755316734314, "logits/rejected": -0.5736774802207947, "logps/chosen": -84.22529602050781, "logps/ref_chosen": -78.99342346191406, "logps/ref_rejected": -91.41653442382812, "logps/rejected": -103.30691528320312, "loss": 1.2598, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22581946849822998, "rewards/margins": 0.2777702510356903, "rewards/rejected": -0.5035896897315979, "step": 307 }, { "epoch": 0.4656084656084656, "epsilon_dpo/beta": 0.04244144633412361, "epsilon_dpo/beta_margin_grad_mean": -0.40081164240837097, "epsilon_dpo/beta_margin_grad_std": 0.17875270545482635, "epsilon_dpo/beta_margin_mean": 0.46681687235832214, "epsilon_dpo/beta_margin_std": 0.8536893725395203, "epsilon_dpo/loss_margin_mean": 11.135279655456543, "grad_norm": 15.095867156982422, "kl/avg_steps": 0.375, "kl/beta": 0.04259693995118141, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.2423009383206874e-07, "logits/chosen": -0.7949353456497192, "logits/rejected": -0.33702436089515686, "logps/chosen": -100.32615661621094, "logps/ref_chosen": -94.20703887939453, "logps/ref_rejected": -103.01887512207031, "logps/rejected": -120.27326965332031, "loss": 1.1351, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2623611092567444, "rewards/margins": 0.46681690216064453, "rewards/rejected": -0.7291780114173889, "step": 308 }, { "epoch": 0.4671201814058957, "epsilon_dpo/beta": 0.04219004511833191, "epsilon_dpo/beta_margin_grad_mean": -0.4012344479560852, "epsilon_dpo/beta_margin_grad_std": 0.1441354751586914, "epsilon_dpo/beta_margin_mean": 0.44948306679725647, "epsilon_dpo/beta_margin_std": 0.6808764934539795, "epsilon_dpo/loss_margin_mean": 10.729732513427734, "grad_norm": 12.998229026794434, "kl/avg_steps": 0.59375, "kl/beta": 0.04243779927492142, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.229664715194511e-07, "logits/chosen": -0.44665130972862244, "logits/rejected": -0.2758372128009796, "logps/chosen": -95.9297103881836, "logps/ref_chosen": -87.41177368164062, "logps/ref_rejected": -105.32408142089844, "logps/rejected": -124.57174682617188, "loss": 1.09, "rewards/accuracies": 0.78125, "rewards/chosen": -0.36134669184684753, "rewards/margins": 0.44948309659957886, "rewards/rejected": -0.810829758644104, "step": 309 }, { "epoch": 0.46863189720332576, "epsilon_dpo/beta": 0.04217834398150444, "epsilon_dpo/beta_margin_grad_mean": -0.4731263518333435, "epsilon_dpo/beta_margin_grad_std": 0.174700528383255, "epsilon_dpo/beta_margin_mean": 0.12875214219093323, "epsilon_dpo/beta_margin_std": 0.8044756054878235, "epsilon_dpo/loss_margin_mean": 3.1934990882873535, "grad_norm": 15.983020782470703, "kl/avg_steps": 0.03125, "kl/beta": 0.04218731075525284, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 3.2170080817777257e-07, "logits/chosen": -0.7636397480964661, "logits/rejected": -0.5896657705307007, "logps/chosen": -111.97227478027344, "logps/ref_chosen": -101.82364654541016, "logps/ref_rejected": -105.92247009277344, "logps/rejected": -119.26460266113281, "loss": 1.4115, "rewards/accuracies": 0.578125, "rewards/chosen": -0.4309285879135132, "rewards/margins": 0.12875217199325562, "rewards/rejected": -0.5596807599067688, "step": 310 }, { "epoch": 0.47014361300075586, "epsilon_dpo/beta": 0.04204654321074486, "epsilon_dpo/beta_margin_grad_mean": -0.4101211428642273, "epsilon_dpo/beta_margin_grad_std": 0.1667424887418747, "epsilon_dpo/beta_margin_mean": 0.4359593689441681, "epsilon_dpo/beta_margin_std": 0.8494295477867126, "epsilon_dpo/loss_margin_mean": 10.489989280700684, "grad_norm": 11.257895469665527, "kl/avg_steps": 0.3125, "kl/beta": 0.04217413440346718, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.204331392103574e-07, "logits/chosen": -0.3479464650154114, "logits/rejected": -0.6605005264282227, "logps/chosen": -84.55355834960938, "logps/ref_chosen": -81.57022857666016, "logps/ref_rejected": -118.00057983398438, "logps/rejected": -131.47389221191406, "loss": 1.149, "rewards/accuracies": 0.734375, "rewards/chosen": -0.12689390778541565, "rewards/margins": 0.4359593391418457, "rewards/rejected": -0.562853217124939, "step": 311 }, { "epoch": 0.47165532879818595, "epsilon_dpo/beta": 0.0418432243168354, "epsilon_dpo/beta_margin_grad_mean": -0.3847061097621918, "epsilon_dpo/beta_margin_grad_std": 0.16446185111999512, "epsilon_dpo/beta_margin_mean": 0.5488578081130981, "epsilon_dpo/beta_margin_std": 0.7945172786712646, "epsilon_dpo/loss_margin_mean": 13.221216201782227, "grad_norm": 10.767632484436035, "kl/avg_steps": 0.484375, "kl/beta": 0.042042750865221024, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.1916350007663176e-07, "logits/chosen": -0.32211631536483765, "logits/rejected": -0.1509503722190857, "logps/chosen": -79.85913848876953, "logps/ref_chosen": -73.72250366210938, "logps/ref_rejected": -101.14654541015625, "logps/rejected": -120.50439453125, "loss": 1.0487, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2573304772377014, "rewards/margins": 0.5488578081130981, "rewards/rejected": -0.8061882257461548, "step": 312 }, { "epoch": 0.47316704459561604, "epsilon_dpo/beta": 0.04171350598335266, "epsilon_dpo/beta_margin_grad_mean": -0.43476757407188416, "epsilon_dpo/beta_margin_grad_std": 0.1839824914932251, "epsilon_dpo/beta_margin_mean": 0.3212515711784363, "epsilon_dpo/beta_margin_std": 0.8919322490692139, "epsilon_dpo/loss_margin_mean": 7.839280605316162, "grad_norm": 12.423453330993652, "kl/avg_steps": 0.3125, "kl/beta": 0.0418400876224041, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.178919262911314e-07, "logits/chosen": -0.2028963714838028, "logits/rejected": -0.39557722210884094, "logps/chosen": -86.79026794433594, "logps/ref_chosen": -83.71331787109375, "logps/ref_rejected": -93.05607604980469, "logps/rejected": -103.9723129272461, "loss": 1.2671, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13130736351013184, "rewards/margins": 0.3212515711784363, "rewards/rejected": -0.4525589346885681, "step": 313 }, { "epoch": 0.47467876039304613, "epsilon_dpo/beta": 0.04155748710036278, "epsilon_dpo/beta_margin_grad_mean": -0.3984832167625427, "epsilon_dpo/beta_margin_grad_std": 0.16552382707595825, "epsilon_dpo/beta_margin_mean": 0.4729049801826477, "epsilon_dpo/beta_margin_std": 0.7664237022399902, "epsilon_dpo/loss_margin_mean": 11.503206253051758, "grad_norm": 14.001582145690918, "kl/avg_steps": 0.375, "kl/beta": 0.041709743440151215, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.166184534225087e-07, "logits/chosen": -0.36525222659111023, "logits/rejected": -0.4043053090572357, "logps/chosen": -102.99305725097656, "logps/ref_chosen": -98.91353607177734, "logps/ref_rejected": -88.04048919677734, "logps/rejected": -103.62322235107422, "loss": 1.1009, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17131654918193817, "rewards/margins": 0.4729049801826477, "rewards/rejected": -0.6442215442657471, "step": 314 }, { "epoch": 0.47619047619047616, "epsilon_dpo/beta": 0.04138924181461334, "epsilon_dpo/beta_margin_grad_mean": -0.4095999598503113, "epsilon_dpo/beta_margin_grad_std": 0.15082554519176483, "epsilon_dpo/beta_margin_mean": 0.41423144936561584, "epsilon_dpo/beta_margin_std": 0.6999982595443726, "epsilon_dpo/loss_margin_mean": 10.115947723388672, "grad_norm": 12.541328430175781, "kl/avg_steps": 0.40625, "kl/beta": 0.04155391454696655, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.1534311709253723e-07, "logits/chosen": -0.2650623023509979, "logits/rejected": -0.30792877078056335, "logps/chosen": -97.30009460449219, "logps/ref_chosen": -92.76774597167969, "logps/ref_rejected": -102.42605590820312, "logps/rejected": -117.07435607910156, "loss": 1.1248, "rewards/accuracies": 0.75, "rewards/chosen": -0.18921594321727753, "rewards/margins": 0.41423147916793823, "rewards/rejected": -0.603447437286377, "step": 315 }, { "epoch": 0.47770219198790626, "epsilon_dpo/beta": 0.041234713047742844, "epsilon_dpo/beta_margin_grad_mean": -0.39376702904701233, "epsilon_dpo/beta_margin_grad_std": 0.16725587844848633, "epsilon_dpo/beta_margin_mean": 0.4972352981567383, "epsilon_dpo/beta_margin_std": 0.7851472496986389, "epsilon_dpo/loss_margin_mean": 12.186201095581055, "grad_norm": 11.967978477478027, "kl/avg_steps": 0.375, "kl/beta": 0.04138578474521637, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.1406595297511564e-07, "logits/chosen": -0.17552141845226288, "logits/rejected": -0.2363368570804596, "logps/chosen": -85.0501708984375, "logps/ref_chosen": -80.11531066894531, "logps/ref_rejected": -124.51179504394531, "logps/rejected": -141.6328582763672, "loss": 1.0872, "rewards/accuracies": 0.703125, "rewards/chosen": -0.20370078086853027, "rewards/margins": 0.49723532795906067, "rewards/rejected": -0.7009360790252686, "step": 316 }, { "epoch": 0.47921390778533635, "epsilon_dpo/beta": 0.04100334644317627, "epsilon_dpo/beta_margin_grad_mean": -0.3886288106441498, "epsilon_dpo/beta_margin_grad_std": 0.171974316239357, "epsilon_dpo/beta_margin_mean": 0.5132362246513367, "epsilon_dpo/beta_margin_std": 0.8470146059989929, "epsilon_dpo/loss_margin_mean": 12.628094673156738, "grad_norm": 16.494524002075195, "kl/avg_steps": 0.5625, "kl/beta": 0.041231170296669006, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.1278699679526975e-07, "logits/chosen": -0.3971477448940277, "logits/rejected": -0.32429075241088867, "logps/chosen": -83.87468719482422, "logps/ref_chosen": -81.12582397460938, "logps/ref_rejected": -98.85325622558594, "logps/rejected": -114.23020935058594, "loss": 1.0944, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1150696724653244, "rewards/margins": 0.5132362842559814, "rewards/rejected": -0.6283059120178223, "step": 317 }, { "epoch": 0.48072562358276644, "epsilon_dpo/beta": 0.04087650030851364, "epsilon_dpo/beta_margin_grad_mean": -0.42562544345855713, "epsilon_dpo/beta_margin_grad_std": 0.18235144019126892, "epsilon_dpo/beta_margin_mean": 0.36548641324043274, "epsilon_dpo/beta_margin_std": 0.8594874143600464, "epsilon_dpo/loss_margin_mean": 9.091790199279785, "grad_norm": 15.041504859924316, "kl/avg_steps": 0.3125, "kl/beta": 0.04100054129958153, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.1150628432815336e-07, "logits/chosen": -0.4328976273536682, "logits/rejected": -0.25933602452278137, "logps/chosen": -80.97486877441406, "logps/ref_chosen": -75.95895385742188, "logps/ref_rejected": -105.97601318359375, "logps/rejected": -120.0837173461914, "loss": 1.2196, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20701169967651367, "rewards/margins": 0.3654864430427551, "rewards/rejected": -0.572498083114624, "step": 318 }, { "epoch": 0.48223733938019653, "epsilon_dpo/beta": 0.04071083664894104, "epsilon_dpo/beta_margin_grad_mean": -0.3958146274089813, "epsilon_dpo/beta_margin_grad_std": 0.1657380759716034, "epsilon_dpo/beta_margin_mean": 0.48401468992233276, "epsilon_dpo/beta_margin_std": 0.7800770401954651, "epsilon_dpo/loss_margin_mean": 12.014669418334961, "grad_norm": 11.02835464477539, "kl/avg_steps": 0.40625, "kl/beta": 0.040872812271118164, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.1022385139804707e-07, "logits/chosen": -0.685593843460083, "logits/rejected": -0.36105144023895264, "logps/chosen": -93.86186218261719, "logps/ref_chosen": -88.54928588867188, "logps/ref_rejected": -102.58941650390625, "logps/rejected": -119.91665649414062, "loss": 1.0955, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21764391660690308, "rewards/margins": 0.48401468992233276, "rewards/rejected": -0.7016586661338806, "step": 319 }, { "epoch": 0.4837490551776266, "epsilon_dpo/beta": 0.040609732270240784, "epsilon_dpo/beta_margin_grad_mean": -0.4467881917953491, "epsilon_dpo/beta_margin_grad_std": 0.17798320949077606, "epsilon_dpo/beta_margin_mean": 0.24270537495613098, "epsilon_dpo/beta_margin_std": 0.8327056169509888, "epsilon_dpo/loss_margin_mean": 6.119729518890381, "grad_norm": 11.728205680847168, "kl/avg_steps": 0.25, "kl/beta": 0.04070743918418884, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.24449878931045532, "logits/rejected": -0.4534444808959961, "logps/chosen": -76.99515533447266, "logps/ref_chosen": -72.40655517578125, "logps/ref_rejected": -90.68508911132812, "logps/rejected": -101.39341735839844, "loss": 1.3166, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1887323260307312, "rewards/margins": 0.24270540475845337, "rewards/rejected": -0.43143773078918457, "step": 320 }, { "epoch": 0.4852607709750567, "epsilon_dpo/beta": 0.040432319045066833, "epsilon_dpo/beta_margin_grad_mean": -0.39743614196777344, "epsilon_dpo/beta_margin_grad_std": 0.1755134016275406, "epsilon_dpo/beta_margin_mean": 0.47886669635772705, "epsilon_dpo/beta_margin_std": 0.8305484652519226, "epsilon_dpo/loss_margin_mean": 11.975590705871582, "grad_norm": 14.635098457336426, "kl/avg_steps": 0.4375, "kl/beta": 0.040605925023555756, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.0765396768561004e-07, "logits/chosen": -0.5184362530708313, "logits/rejected": -0.4613918662071228, "logps/chosen": -74.00281524658203, "logps/ref_chosen": -71.84503936767578, "logps/ref_rejected": -78.1065673828125, "logps/rejected": -92.23993682861328, "loss": 1.1179, "rewards/accuracies": 0.75, "rewards/chosen": -0.09022434055805206, "rewards/margins": 0.47886669635772705, "rewards/rejected": -0.5690910220146179, "step": 321 }, { "epoch": 0.48677248677248675, "epsilon_dpo/beta": 0.04016774892807007, "epsilon_dpo/beta_margin_grad_mean": -0.36837536096572876, "epsilon_dpo/beta_margin_grad_std": 0.15891148149967194, "epsilon_dpo/beta_margin_mean": 0.6071159243583679, "epsilon_dpo/beta_margin_std": 0.7657486796379089, "epsilon_dpo/loss_margin_mean": 15.201668739318848, "grad_norm": 11.990974426269531, "kl/avg_steps": 0.65625, "kl/beta": 0.040429048240184784, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.063665887884511e-07, "logits/chosen": -0.18207885324954987, "logits/rejected": -0.3786749541759491, "logps/chosen": -76.58918762207031, "logps/ref_chosen": -70.58155822753906, "logps/ref_rejected": -99.65203094482422, "logps/rejected": -120.861328125, "loss": 0.9974, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2421734780073166, "rewards/margins": 0.6071159243583679, "rewards/rejected": -0.8492894172668457, "step": 322 }, { "epoch": 0.48828420256991684, "epsilon_dpo/beta": 0.04000628739595413, "epsilon_dpo/beta_margin_grad_mean": -0.4120200574398041, "epsilon_dpo/beta_margin_grad_std": 0.1918712854385376, "epsilon_dpo/beta_margin_mean": 0.4309968650341034, "epsilon_dpo/beta_margin_std": 0.9564192295074463, "epsilon_dpo/loss_margin_mean": 10.92121410369873, "grad_norm": 12.162070274353027, "kl/avg_steps": 0.40625, "kl/beta": 0.04016546159982681, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.0507763319663517e-07, "logits/chosen": -0.4847533106803894, "logits/rejected": -0.21049503982067108, "logps/chosen": -100.65248107910156, "logps/ref_chosen": -93.55892944335938, "logps/ref_rejected": -122.09970092773438, "logps/rejected": -140.11447143554688, "loss": 1.1997, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2870478630065918, "rewards/margins": 0.4309968948364258, "rewards/rejected": -0.7180447578430176, "step": 323 }, { "epoch": 0.4897959183673469, "epsilon_dpo/beta": 0.03980691358447075, "epsilon_dpo/beta_margin_grad_mean": -0.3884578347206116, "epsilon_dpo/beta_margin_grad_std": 0.14908921718597412, "epsilon_dpo/beta_margin_mean": 0.512287437915802, "epsilon_dpo/beta_margin_std": 0.7012259364128113, "epsilon_dpo/loss_margin_mean": 12.961596488952637, "grad_norm": 11.008560180664062, "kl/avg_steps": 0.5, "kl/beta": 0.04000294953584671, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.0378713696502097e-07, "logits/chosen": -0.4199288785457611, "logits/rejected": -0.7384412288665771, "logps/chosen": -85.6984634399414, "logps/ref_chosen": -81.87194061279297, "logps/ref_rejected": -91.50868225097656, "logps/rejected": -108.29679870605469, "loss": 1.0477, "rewards/accuracies": 0.78125, "rewards/chosen": -0.15418805181980133, "rewards/margins": 0.512287437915802, "rewards/rejected": -0.6664755344390869, "step": 324 }, { "epoch": 0.491307634164777, "epsilon_dpo/beta": 0.03965863212943077, "epsilon_dpo/beta_margin_grad_mean": -0.3986729383468628, "epsilon_dpo/beta_margin_grad_std": 0.1633588671684265, "epsilon_dpo/beta_margin_mean": 0.4666576087474823, "epsilon_dpo/beta_margin_std": 0.7744911909103394, "epsilon_dpo/loss_margin_mean": 11.899657249450684, "grad_norm": 11.990095138549805, "kl/avg_steps": 0.375, "kl/beta": 0.03980392962694168, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.0249513619156206e-07, "logits/chosen": -0.3202875256538391, "logits/rejected": -0.4885994791984558, "logps/chosen": -97.64002990722656, "logps/ref_chosen": -86.96577453613281, "logps/ref_rejected": -104.21844482421875, "logps/rejected": -126.79237365722656, "loss": 1.107, "rewards/accuracies": 0.703125, "rewards/chosen": -0.42622262239456177, "rewards/margins": 0.4666576385498047, "rewards/rejected": -0.8928803205490112, "step": 325 }, { "epoch": 0.4928193499622071, "epsilon_dpo/beta": 0.03953525424003601, "epsilon_dpo/beta_margin_grad_mean": -0.42973268032073975, "epsilon_dpo/beta_margin_grad_std": 0.16378960013389587, "epsilon_dpo/beta_margin_mean": 0.3205667734146118, "epsilon_dpo/beta_margin_std": 0.7373978495597839, "epsilon_dpo/loss_margin_mean": 8.246063232421875, "grad_norm": 12.42955493927002, "kl/avg_steps": 0.3125, "kl/beta": 0.03965522348880768, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.012016670162977e-07, "logits/chosen": -0.5864256620407104, "logits/rejected": -0.6668286323547363, "logps/chosen": -110.80265808105469, "logps/ref_chosen": -98.51603698730469, "logps/ref_rejected": -97.31979370117188, "logps/rejected": -117.85248565673828, "loss": 1.2173, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4882323145866394, "rewards/margins": 0.3205668032169342, "rewards/rejected": -0.8087990880012512, "step": 326 }, { "epoch": 0.4943310657596372, "epsilon_dpo/beta": 0.03943679854273796, "epsilon_dpo/beta_margin_grad_mean": -0.42828133702278137, "epsilon_dpo/beta_margin_grad_std": 0.17430712282657623, "epsilon_dpo/beta_margin_mean": 0.3396408259868622, "epsilon_dpo/beta_margin_std": 0.842943549156189, "epsilon_dpo/loss_margin_mean": 8.755025863647461, "grad_norm": 13.681025505065918, "kl/avg_steps": 0.25, "kl/beta": 0.039531685411930084, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.99906765620341e-07, "logits/chosen": -0.374507337808609, "logits/rejected": -0.5013106465339661, "logps/chosen": -107.53089904785156, "logps/ref_chosen": -95.87124633789062, "logps/ref_rejected": -108.29940795898438, "logps/rejected": -128.71409606933594, "loss": 1.2332, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4616592824459076, "rewards/margins": 0.33964085578918457, "rewards/rejected": -0.8013001084327698, "step": 327 }, { "epoch": 0.4958427815570673, "epsilon_dpo/beta": 0.039276834577322006, "epsilon_dpo/beta_margin_grad_mean": -0.4211041033267975, "epsilon_dpo/beta_margin_grad_std": 0.16422966122627258, "epsilon_dpo/beta_margin_mean": 0.3656752407550812, "epsilon_dpo/beta_margin_std": 0.7797321677207947, "epsilon_dpo/loss_margin_mean": 9.426528930664062, "grad_norm": 11.109094619750977, "kl/avg_steps": 0.40625, "kl/beta": 0.03943310305476189, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.9861046822486766e-07, "logits/chosen": -0.2030862420797348, "logits/rejected": -0.10589778423309326, "logps/chosen": -95.96703338623047, "logps/ref_chosen": -85.7418212890625, "logps/ref_rejected": -104.25873565673828, "logps/rejected": -123.91046905517578, "loss": 1.1903, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4038175642490387, "rewards/margins": 0.3656752407550812, "rewards/rejected": -0.7694927453994751, "step": 328 }, { "epoch": 0.4973544973544973, "epsilon_dpo/beta": 0.03911792114377022, "epsilon_dpo/beta_margin_grad_mean": -0.4037947654724121, "epsilon_dpo/beta_margin_grad_std": 0.15787221491336823, "epsilon_dpo/beta_margin_mean": 0.4557969570159912, "epsilon_dpo/beta_margin_std": 0.7563498616218567, "epsilon_dpo/loss_margin_mean": 11.770658493041992, "grad_norm": 11.235872268676758, "kl/avg_steps": 0.40625, "kl/beta": 0.03927355632185936, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.9731281109010253e-07, "logits/chosen": -0.2828035354614258, "logits/rejected": -0.26564446091651917, "logps/chosen": -95.25900268554688, "logps/ref_chosen": -85.7830810546875, "logps/ref_rejected": -109.35794830322266, "logps/rejected": -130.6045379638672, "loss": 1.1076, "rewards/accuracies": 0.703125, "rewards/chosen": -0.37267088890075684, "rewards/margins": 0.4557969570159912, "rewards/rejected": -0.828467845916748, "step": 329 }, { "epoch": 0.4988662131519274, "epsilon_dpo/beta": 0.03892296925187111, "epsilon_dpo/beta_margin_grad_mean": -0.3836939036846161, "epsilon_dpo/beta_margin_grad_std": 0.18278218805789948, "epsilon_dpo/beta_margin_mean": 0.5425154566764832, "epsilon_dpo/beta_margin_std": 0.8813403248786926, "epsilon_dpo/loss_margin_mean": 14.080089569091797, "grad_norm": 10.5839262008667, "kl/avg_steps": 0.5, "kl/beta": 0.03911465033888817, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.9601383051430505e-07, "logits/chosen": -0.24584242701530457, "logits/rejected": -0.20333293080329895, "logps/chosen": -82.42384338378906, "logps/ref_chosen": -75.27607727050781, "logps/ref_rejected": -102.55793762207031, "logps/rejected": -123.7857894897461, "loss": 1.0872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2804255187511444, "rewards/margins": 0.5425155162811279, "rewards/rejected": -0.82294100522995, "step": 330 }, { "epoch": 0.5003779289493575, "epsilon_dpo/beta": 0.03874148800969124, "epsilon_dpo/beta_margin_grad_mean": -0.38367658853530884, "epsilon_dpo/beta_margin_grad_std": 0.17551447451114655, "epsilon_dpo/beta_margin_mean": 0.5445398092269897, "epsilon_dpo/beta_margin_std": 0.8645876049995422, "epsilon_dpo/loss_margin_mean": 14.192091941833496, "grad_norm": 11.384100914001465, "kl/avg_steps": 0.46875, "kl/beta": 0.03892005234956741, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.947135628327544e-07, "logits/chosen": -0.27640146017074585, "logits/rejected": -0.12456096708774567, "logps/chosen": -90.9360122680664, "logps/ref_chosen": -80.22660064697266, "logps/ref_rejected": -90.82720947265625, "logps/rejected": -115.72871398925781, "loss": 1.0769, "rewards/accuracies": 0.734375, "rewards/chosen": -0.41756218671798706, "rewards/margins": 0.5445398092269897, "rewards/rejected": -0.9621019959449768, "step": 331 }, { "epoch": 0.5018896447467877, "epsilon_dpo/beta": 0.038560736924409866, "epsilon_dpo/beta_margin_grad_mean": -0.3933686316013336, "epsilon_dpo/beta_margin_grad_std": 0.1542639285326004, "epsilon_dpo/beta_margin_mean": 0.5055604577064514, "epsilon_dpo/beta_margin_std": 0.7535479068756104, "epsilon_dpo/loss_margin_mean": 13.21468734741211, "grad_norm": 12.87071704864502, "kl/avg_steps": 0.46875, "kl/beta": 0.038738466799259186, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.4173445701599121, "logits/rejected": -0.5104435086250305, "logps/chosen": -100.83187103271484, "logps/ref_chosen": -90.54778289794922, "logps/ref_rejected": -107.08577728271484, "logps/rejected": -130.58456420898438, "loss": 1.0664, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3975829482078552, "rewards/margins": 0.5055604577064514, "rewards/rejected": -0.9031434059143066, "step": 332 }, { "epoch": 0.5034013605442177, "epsilon_dpo/beta": 0.03841697797179222, "epsilon_dpo/beta_margin_grad_mean": -0.3773667812347412, "epsilon_dpo/beta_margin_grad_std": 0.14968030154705048, "epsilon_dpo/beta_margin_mean": 0.5757911801338196, "epsilon_dpo/beta_margin_std": 0.7351418137550354, "epsilon_dpo/loss_margin_mean": 15.106307983398438, "grad_norm": 11.473505020141602, "kl/avg_steps": 0.375, "kl/beta": 0.03855772688984871, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.921093116725076e-07, "logits/chosen": -0.5246701240539551, "logits/rejected": -0.4246484637260437, "logps/chosen": -94.96739196777344, "logps/ref_chosen": -83.32445526123047, "logps/ref_rejected": -112.07743835449219, "logps/rejected": -138.82667541503906, "loss": 1.0074, "rewards/accuracies": 0.734375, "rewards/chosen": -0.44865351915359497, "rewards/margins": 0.5757912397384644, "rewards/rejected": -1.0244446992874146, "step": 333 }, { "epoch": 0.5049130763416477, "epsilon_dpo/beta": 0.038321349769830704, "epsilon_dpo/beta_margin_grad_mean": -0.42791518568992615, "epsilon_dpo/beta_margin_grad_std": 0.2027246654033661, "epsilon_dpo/beta_margin_mean": 0.3443240225315094, "epsilon_dpo/beta_margin_std": 0.9910604953765869, "epsilon_dpo/loss_margin_mean": 9.166866302490234, "grad_norm": 14.25706672668457, "kl/avg_steps": 0.25, "kl/beta": 0.038413673639297485, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 2.9080540104031484e-07, "logits/chosen": -0.13559825718402863, "logits/rejected": -0.3539199233055115, "logps/chosen": -104.35836029052734, "logps/ref_chosen": -91.25233459472656, "logps/ref_rejected": -114.59999084472656, "logps/rejected": -136.8728790283203, "loss": 1.2894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5054439902305603, "rewards/margins": 0.3443240523338318, "rewards/rejected": -0.8497680425643921, "step": 334 }, { "epoch": 0.5064247921390779, "epsilon_dpo/beta": 0.03822590783238411, "epsilon_dpo/beta_margin_grad_mean": -0.41665026545524597, "epsilon_dpo/beta_margin_grad_std": 0.20269030332565308, "epsilon_dpo/beta_margin_mean": 0.4251091480255127, "epsilon_dpo/beta_margin_std": 0.9988790154457092, "epsilon_dpo/loss_margin_mean": 11.305398941040039, "grad_norm": 12.197574615478516, "kl/avg_steps": 0.25, "kl/beta": 0.038317881524562836, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.895003489933375e-07, "logits/chosen": -0.1341536045074463, "logits/rejected": -0.1751415729522705, "logps/chosen": -99.05448913574219, "logps/ref_chosen": -83.96897888183594, "logps/ref_rejected": -99.78108215332031, "logps/rejected": -126.1719970703125, "loss": 1.2237, "rewards/accuracies": 0.625, "rewards/chosen": -0.5797243118286133, "rewards/margins": 0.4251091480255127, "rewards/rejected": -1.004833459854126, "step": 335 }, { "epoch": 0.5079365079365079, "epsilon_dpo/beta": 0.03805890679359436, "epsilon_dpo/beta_margin_grad_mean": -0.3994874060153961, "epsilon_dpo/beta_margin_grad_std": 0.16580943763256073, "epsilon_dpo/beta_margin_mean": 0.4782327711582184, "epsilon_dpo/beta_margin_std": 0.8129090070724487, "epsilon_dpo/loss_margin_mean": 12.692656517028809, "grad_norm": 13.025675773620605, "kl/avg_steps": 0.4375, "kl/beta": 0.03822232410311699, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.8819419203668675e-07, "logits/chosen": -0.1973276436328888, "logits/rejected": -0.1543230414390564, "logps/chosen": -113.52323913574219, "logps/ref_chosen": -95.66194152832031, "logps/ref_rejected": -114.09184265136719, "logps/rejected": -144.64581298828125, "loss": 1.1081, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6822552680969238, "rewards/margins": 0.47823280096054077, "rewards/rejected": -1.1604881286621094, "step": 336 }, { "epoch": 0.509448223733938, "epsilon_dpo/beta": 0.037964485585689545, "epsilon_dpo/beta_margin_grad_mean": -0.4189932942390442, "epsilon_dpo/beta_margin_grad_std": 0.1795109510421753, "epsilon_dpo/beta_margin_mean": 0.38377320766448975, "epsilon_dpo/beta_margin_std": 0.8511192202568054, "epsilon_dpo/loss_margin_mean": 10.27182388305664, "grad_norm": 10.546476364135742, "kl/avg_steps": 0.25, "kl/beta": 0.03805582970380783, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.8688696670638053e-07, "logits/chosen": -0.20124396681785583, "logits/rejected": -0.48997047543525696, "logps/chosen": -117.29708862304688, "logps/ref_chosen": -99.90296173095703, "logps/ref_rejected": -113.39192199707031, "logps/rejected": -141.05787658691406, "loss": 1.2014, "rewards/accuracies": 0.625, "rewards/chosen": -0.6641528606414795, "rewards/margins": 0.38377323746681213, "rewards/rejected": -1.0479261875152588, "step": 337 }, { "epoch": 0.5109599395313681, "epsilon_dpo/beta": 0.03785794600844383, "epsilon_dpo/beta_margin_grad_mean": -0.42979949712753296, "epsilon_dpo/beta_margin_grad_std": 0.1584760844707489, "epsilon_dpo/beta_margin_mean": 0.3304803669452667, "epsilon_dpo/beta_margin_std": 0.7403267025947571, "epsilon_dpo/loss_margin_mean": 8.861513137817383, "grad_norm": 11.558711051940918, "kl/avg_steps": 0.28125, "kl/beta": 0.037960927933454514, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -0.06663598120212555, "logits/rejected": -0.06503565609455109, "logps/chosen": -105.7027587890625, "logps/ref_chosen": -86.85198974609375, "logps/ref_rejected": -96.50064086914062, "logps/rejected": -124.21292114257812, "loss": 1.2068, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7169533967971802, "rewards/margins": 0.33048033714294434, "rewards/rejected": -1.0474337339401245, "step": 338 }, { "epoch": 0.5124716553287982, "epsilon_dpo/beta": 0.03770444914698601, "epsilon_dpo/beta_margin_grad_mean": -0.3875303566455841, "epsilon_dpo/beta_margin_grad_std": 0.1659288853406906, "epsilon_dpo/beta_margin_mean": 0.515190601348877, "epsilon_dpo/beta_margin_std": 0.7667047381401062, "epsilon_dpo/loss_margin_mean": 13.80580997467041, "grad_norm": 15.705618858337402, "kl/avg_steps": 0.40625, "kl/beta": 0.03785446286201477, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.842694572172736e-07, "logits/chosen": -0.544682502746582, "logits/rejected": -0.5630224943161011, "logps/chosen": -78.35090637207031, "logps/ref_chosen": -66.23301696777344, "logps/ref_rejected": -91.11048126220703, "logps/rejected": -117.03418731689453, "loss": 1.0689, "rewards/accuracies": 0.734375, "rewards/chosen": -0.45997846126556396, "rewards/margins": 0.515190601348877, "rewards/rejected": -0.9751690626144409, "step": 339 }, { "epoch": 0.5139833711262283, "epsilon_dpo/beta": 0.0375872403383255, "epsilon_dpo/beta_margin_grad_mean": -0.3984226584434509, "epsilon_dpo/beta_margin_grad_std": 0.19148968160152435, "epsilon_dpo/beta_margin_mean": 0.49994638562202454, "epsilon_dpo/beta_margin_std": 0.9249793291091919, "epsilon_dpo/loss_margin_mean": 13.473993301391602, "grad_norm": 10.675450325012207, "kl/avg_steps": 0.3125, "kl/beta": 0.03770130127668381, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.8295924627584004e-07, "logits/chosen": -0.1954106092453003, "logits/rejected": -0.060511067509651184, "logps/chosen": -92.63058471679688, "logps/ref_chosen": -76.95579528808594, "logps/ref_rejected": -86.38760375976562, "logps/rejected": -115.53638458251953, "loss": 1.1359, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5927058458328247, "rewards/margins": 0.49994635581970215, "rewards/rejected": -1.0926522016525269, "step": 340 }, { "epoch": 0.5154950869236583, "epsilon_dpo/beta": 0.037458401173353195, "epsilon_dpo/beta_margin_grad_mean": -0.3949894607067108, "epsilon_dpo/beta_margin_grad_std": 0.19184111058712006, "epsilon_dpo/beta_margin_mean": 0.5000091195106506, "epsilon_dpo/beta_margin_std": 0.9141637682914734, "epsilon_dpo/loss_margin_mean": 13.519756317138672, "grad_norm": 11.776097297668457, "kl/avg_steps": 0.34375, "kl/beta": 0.0375838503241539, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.816481133934373e-07, "logits/chosen": -0.3284544348716736, "logits/rejected": -0.22923079133033752, "logps/chosen": -100.67996215820312, "logps/ref_chosen": -84.42140197753906, "logps/ref_rejected": -101.77664184570312, "logps/rejected": -131.55496215820312, "loss": 1.1337, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6120609641075134, "rewards/margins": 0.5000091195106506, "rewards/rejected": -1.112070083618164, "step": 341 }, { "epoch": 0.5170068027210885, "epsilon_dpo/beta": 0.037294965237379074, "epsilon_dpo/beta_margin_grad_mean": -0.39244845509529114, "epsilon_dpo/beta_margin_grad_std": 0.18085214495658875, "epsilon_dpo/beta_margin_mean": 0.5229305028915405, "epsilon_dpo/beta_margin_std": 0.8699291944503784, "epsilon_dpo/loss_margin_mean": 14.16259479522705, "grad_norm": 10.74425220489502, "kl/avg_steps": 0.4375, "kl/beta": 0.03745510056614876, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.8033609524527046e-07, "logits/chosen": -0.24709981679916382, "logits/rejected": -0.25939422845840454, "logps/chosen": -89.46382141113281, "logps/ref_chosen": -75.53753662109375, "logps/ref_rejected": -91.3760986328125, "logps/rejected": -119.46498107910156, "loss": 1.0964, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5207824110984802, "rewards/margins": 0.5229305028915405, "rewards/rejected": -1.043712854385376, "step": 342 }, { "epoch": 0.5185185185185185, "epsilon_dpo/beta": 0.03719655051827431, "epsilon_dpo/beta_margin_grad_mean": -0.44036367535591125, "epsilon_dpo/beta_margin_grad_std": 0.17963874340057373, "epsilon_dpo/beta_margin_mean": 0.27663713693618774, "epsilon_dpo/beta_margin_std": 0.8233576416969299, "epsilon_dpo/loss_margin_mean": 7.593426704406738, "grad_norm": 11.922304153442383, "kl/avg_steps": 0.265625, "kl/beta": 0.03729194775223732, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.7902322853130753e-07, "logits/chosen": -0.46268126368522644, "logits/rejected": -0.49861472845077515, "logps/chosen": -111.53775787353516, "logps/ref_chosen": -96.93223571777344, "logps/ref_rejected": -104.06831359863281, "logps/rejected": -126.26726531982422, "loss": 1.2851, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5451029539108276, "rewards/margins": 0.27663713693618774, "rewards/rejected": -0.8217400312423706, "step": 343 }, { "epoch": 0.5200302343159486, "epsilon_dpo/beta": 0.037034135311841965, "epsilon_dpo/beta_margin_grad_mean": -0.39166146516799927, "epsilon_dpo/beta_margin_grad_std": 0.17405946552753448, "epsilon_dpo/beta_margin_mean": 0.5026716589927673, "epsilon_dpo/beta_margin_std": 0.8177103996276855, "epsilon_dpo/loss_margin_mean": 13.722208023071289, "grad_norm": 11.160215377807617, "kl/avg_steps": 0.4375, "kl/beta": 0.03719315305352211, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.7770954997525274e-07, "logits/chosen": -0.29269033670425415, "logits/rejected": -0.29436129331588745, "logps/chosen": -97.3475112915039, "logps/ref_chosen": -78.63787841796875, "logps/ref_rejected": -107.91048431396484, "logps/rejected": -140.3423309326172, "loss": 1.0953, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6958789229393005, "rewards/margins": 0.5026716589927673, "rewards/rejected": -1.1985507011413574, "step": 344 }, { "epoch": 0.5215419501133787, "epsilon_dpo/beta": 0.03689596429467201, "epsilon_dpo/beta_margin_grad_mean": -0.4142055809497833, "epsilon_dpo/beta_margin_grad_std": 0.18251675367355347, "epsilon_dpo/beta_margin_mean": 0.4029293954372406, "epsilon_dpo/beta_margin_std": 0.871208667755127, "epsilon_dpo/loss_margin_mean": 11.078716278076172, "grad_norm": 15.428401947021484, "kl/avg_steps": 0.375, "kl/beta": 0.037031140178442, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.7639509632351927e-07, "logits/chosen": -0.12234549969434738, "logits/rejected": -0.16197794675827026, "logps/chosen": -91.65048217773438, "logps/ref_chosen": -79.6897201538086, "logps/ref_rejected": -100.54981994628906, "logps/rejected": -123.58929443359375, "loss": 1.1932, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44506821036338806, "rewards/margins": 0.4029293954372406, "rewards/rejected": -0.8479976654052734, "step": 345 }, { "epoch": 0.5230536659108088, "epsilon_dpo/beta": 0.036735061556100845, "epsilon_dpo/beta_margin_grad_mean": -0.3954569101333618, "epsilon_dpo/beta_margin_grad_std": 0.17873188853263855, "epsilon_dpo/beta_margin_mean": 0.5000748038291931, "epsilon_dpo/beta_margin_std": 0.8613612055778503, "epsilon_dpo/loss_margin_mean": 13.76108169555664, "grad_norm": 13.216407775878906, "kl/avg_steps": 0.4375, "kl/beta": 0.03689279407262802, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.7507990434420123e-07, "logits/chosen": -0.3659716546535492, "logits/rejected": -0.22499585151672363, "logps/chosen": -87.64271545410156, "logps/ref_chosen": -75.92257690429688, "logps/ref_rejected": -116.39082336425781, "logps/rejected": -141.87203979492188, "loss": 1.1105, "rewards/accuracies": 0.6875, "rewards/chosen": -0.43288546800613403, "rewards/margins": 0.5000748038291931, "rewards/rejected": -0.9329602718353271, "step": 346 }, { "epoch": 0.5245653817082389, "epsilon_dpo/beta": 0.036632440984249115, "epsilon_dpo/beta_margin_grad_mean": -0.4249770939350128, "epsilon_dpo/beta_margin_grad_std": 0.1754349023103714, "epsilon_dpo/beta_margin_mean": 0.36538615822792053, "epsilon_dpo/beta_margin_std": 0.8376450538635254, "epsilon_dpo/loss_margin_mean": 10.127693176269531, "grad_norm": 11.684128761291504, "kl/avg_steps": 0.28125, "kl/beta": 0.03673208877444267, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.737640108260456e-07, "logits/chosen": -0.3123420178890228, "logits/rejected": -0.3318549692630768, "logps/chosen": -96.49603271484375, "logps/ref_chosen": -78.1712875366211, "logps/ref_rejected": -98.7479248046875, "logps/rejected": -127.20036315917969, "loss": 1.21, "rewards/accuracies": 0.625, "rewards/chosen": -0.674199104309082, "rewards/margins": 0.36538615822792053, "rewards/rejected": -1.0395852327346802, "step": 347 }, { "epoch": 0.5260770975056689, "epsilon_dpo/beta": 0.03654114902019501, "epsilon_dpo/beta_margin_grad_mean": -0.4058833718299866, "epsilon_dpo/beta_margin_grad_std": 0.17744751274585724, "epsilon_dpo/beta_margin_mean": 0.4648371636867523, "epsilon_dpo/beta_margin_std": 0.8594658374786377, "epsilon_dpo/loss_margin_mean": 12.89057445526123, "grad_norm": 14.16436767578125, "kl/avg_steps": 0.25, "kl/beta": 0.03662906959652901, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.724474525774229e-07, "logits/chosen": -0.21772196888923645, "logits/rejected": -0.3911592364311218, "logps/chosen": -92.62994384765625, "logps/ref_chosen": -75.35821533203125, "logps/ref_rejected": -90.65567016601562, "logps/rejected": -120.8179702758789, "loss": 1.136, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6333014965057373, "rewards/margins": 0.4648371636867523, "rewards/rejected": -1.098138689994812, "step": 348 }, { "epoch": 0.527588813303099, "epsilon_dpo/beta": 0.03637009114027023, "epsilon_dpo/beta_margin_grad_mean": -0.40059003233909607, "epsilon_dpo/beta_margin_grad_std": 0.18574047088623047, "epsilon_dpo/beta_margin_mean": 0.4811083674430847, "epsilon_dpo/beta_margin_std": 0.910507321357727, "epsilon_dpo/loss_margin_mean": 13.37955093383789, "grad_norm": 11.531458854675293, "kl/avg_steps": 0.46875, "kl/beta": 0.03653772547841072, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.711302664252973e-07, "logits/chosen": -0.5206562280654907, "logits/rejected": -0.34496116638183594, "logps/chosen": -90.16403198242188, "logps/ref_chosen": -75.35797119140625, "logps/ref_rejected": -109.66375732421875, "logps/rejected": -137.849365234375, "loss": 1.1427, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5402654409408569, "rewards/margins": 0.4811083674430847, "rewards/rejected": -1.0213738679885864, "step": 349 }, { "epoch": 0.5291005291005291, "epsilon_dpo/beta": 0.036143574863672256, "epsilon_dpo/beta_margin_grad_mean": -0.3563748002052307, "epsilon_dpo/beta_margin_grad_std": 0.14815759658813477, "epsilon_dpo/beta_margin_mean": 0.6667227149009705, "epsilon_dpo/beta_margin_std": 0.7182585597038269, "epsilon_dpo/loss_margin_mean": 18.544713973999023, "grad_norm": 14.325149536132812, "kl/avg_steps": 0.625, "kl/beta": 0.036367256194353104, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.698124892141971e-07, "logits/chosen": -0.4628453254699707, "logits/rejected": -0.3299052119255066, "logps/chosen": -99.627197265625, "logps/ref_chosen": -84.2036361694336, "logps/ref_rejected": -115.74954223632812, "logps/rejected": -149.71783447265625, "loss": 0.9385, "rewards/accuracies": 0.796875, "rewards/chosen": -0.55904221534729, "rewards/margins": 0.6667227149009705, "rewards/rejected": -1.2257649898529053, "step": 350 }, { "epoch": 0.5306122448979592, "epsilon_dpo/beta": 0.03597555682063103, "epsilon_dpo/beta_margin_grad_mean": -0.4049746096134186, "epsilon_dpo/beta_margin_grad_std": 0.18007493019104004, "epsilon_dpo/beta_margin_mean": 0.4236089587211609, "epsilon_dpo/beta_margin_std": 0.8556088805198669, "epsilon_dpo/loss_margin_mean": 11.925423622131348, "grad_norm": 12.217904090881348, "kl/avg_steps": 0.46875, "kl/beta": 0.036141373217105865, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.6849415780518357e-07, "logits/chosen": -0.2202155888080597, "logits/rejected": -0.2810223698616028, "logps/chosen": -92.83936309814453, "logps/ref_chosen": -75.33352661132812, "logps/ref_rejected": -103.31353759765625, "logps/rejected": -132.74481201171875, "loss": 1.1721, "rewards/accuracies": 0.75, "rewards/chosen": -0.6318588852882385, "rewards/margins": 0.4236089289188385, "rewards/rejected": -1.0554677248001099, "step": 351 }, { "epoch": 0.5321239606953893, "epsilon_dpo/beta": 0.03585267439484596, "epsilon_dpo/beta_margin_grad_mean": -0.40181681513786316, "epsilon_dpo/beta_margin_grad_std": 0.1694328784942627, "epsilon_dpo/beta_margin_mean": 0.4756009876728058, "epsilon_dpo/beta_margin_std": 0.8291955590248108, "epsilon_dpo/loss_margin_mean": 13.421477317810059, "grad_norm": 10.798321723937988, "kl/avg_steps": 0.34375, "kl/beta": 0.03597274795174599, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.6717530907482024e-07, "logits/chosen": -0.3640022873878479, "logits/rejected": -0.23501214385032654, "logps/chosen": -105.84851837158203, "logps/ref_chosen": -87.23213195800781, "logps/ref_rejected": -112.02339172363281, "logps/rejected": -144.06124877929688, "loss": 1.1153, "rewards/accuracies": 0.671875, "rewards/chosen": -0.670296311378479, "rewards/margins": 0.4756010174751282, "rewards/rejected": -1.145897388458252, "step": 352 }, { "epoch": 0.5336356764928194, "epsilon_dpo/beta": 0.03561781346797943, "epsilon_dpo/beta_margin_grad_mean": -0.38533923029899597, "epsilon_dpo/beta_margin_grad_std": 0.18581627309322357, "epsilon_dpo/beta_margin_mean": 0.5274343490600586, "epsilon_dpo/beta_margin_std": 0.9047785401344299, "epsilon_dpo/loss_margin_mean": 14.941997528076172, "grad_norm": 11.600533485412598, "kl/avg_steps": 0.65625, "kl/beta": 0.03584951534867287, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.658559799141411e-07, "logits/chosen": -0.31952571868896484, "logits/rejected": -0.42992115020751953, "logps/chosen": -104.45259857177734, "logps/ref_chosen": -90.74217224121094, "logps/ref_rejected": -93.2311019897461, "logps/rejected": -121.88351440429688, "loss": 1.1077, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49013230204582214, "rewards/margins": 0.5274343490600586, "rewards/rejected": -1.0175666809082031, "step": 353 }, { "epoch": 0.5351473922902494, "epsilon_dpo/beta": 0.035452380776405334, "epsilon_dpo/beta_margin_grad_mean": -0.3861326277256012, "epsilon_dpo/beta_margin_grad_std": 0.18367260694503784, "epsilon_dpo/beta_margin_mean": 0.5246230959892273, "epsilon_dpo/beta_margin_std": 0.8698256015777588, "epsilon_dpo/loss_margin_mean": 14.963334083557129, "grad_norm": 11.076436042785645, "kl/avg_steps": 0.46875, "kl/beta": 0.03561578691005707, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.6453620722761895e-07, "logits/chosen": -0.503691554069519, "logits/rejected": -0.20610500872135162, "logps/chosen": -73.68644714355469, "logps/ref_chosen": -59.317867279052734, "logps/ref_rejected": -106.89750671386719, "logps/rejected": -136.2294158935547, "loss": 1.0983, "rewards/accuracies": 0.71875, "rewards/chosen": -0.511963963508606, "rewards/margins": 0.5246231555938721, "rewards/rejected": -1.036587119102478, "step": 354 }, { "epoch": 0.5366591080876795, "epsilon_dpo/beta": 0.0352315790951252, "epsilon_dpo/beta_margin_grad_mean": -0.3844098150730133, "epsilon_dpo/beta_margin_grad_std": 0.17795078456401825, "epsilon_dpo/beta_margin_mean": 0.5513162612915039, "epsilon_dpo/beta_margin_std": 0.868008017539978, "epsilon_dpo/loss_margin_mean": 15.773537635803223, "grad_norm": 11.321294784545898, "kl/avg_steps": 0.625, "kl/beta": 0.035449616611003876, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.632160279321328e-07, "logits/chosen": -0.2635769546031952, "logits/rejected": -0.6916155815124512, "logps/chosen": -80.50833892822266, "logps/ref_chosen": -66.28169250488281, "logps/ref_rejected": -107.13027954101562, "logps/rejected": -137.13046264648438, "loss": 1.0733, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5025330185890198, "rewards/margins": 0.5513162612915039, "rewards/rejected": -1.053849220275879, "step": 355 }, { "epoch": 0.5381708238851096, "epsilon_dpo/beta": 0.03508982062339783, "epsilon_dpo/beta_margin_grad_mean": -0.3791634738445282, "epsilon_dpo/beta_margin_grad_std": 0.1978328675031662, "epsilon_dpo/beta_margin_mean": 0.5767776966094971, "epsilon_dpo/beta_margin_std": 0.9672296047210693, "epsilon_dpo/loss_margin_mean": 16.622722625732422, "grad_norm": 14.604504585266113, "kl/avg_steps": 0.40625, "kl/beta": 0.035229433327913284, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.618954789559356e-07, "logits/chosen": -0.3518662452697754, "logits/rejected": -0.28898200392723083, "logps/chosen": -76.50581359863281, "logps/ref_chosen": -66.96154022216797, "logps/ref_rejected": -96.09982299804688, "logps/rejected": -122.26683044433594, "loss": 1.0952, "rewards/accuracies": 0.71875, "rewards/chosen": -0.33804744482040405, "rewards/margins": 0.5767776966094971, "rewards/rejected": -0.9148250818252563, "step": 356 }, { "epoch": 0.5396825396825397, "epsilon_dpo/beta": 0.03495881333947182, "epsilon_dpo/beta_margin_grad_mean": -0.404715359210968, "epsilon_dpo/beta_margin_grad_std": 0.18092228472232819, "epsilon_dpo/beta_margin_mean": 0.45309287309646606, "epsilon_dpo/beta_margin_std": 0.8744781017303467, "epsilon_dpo/loss_margin_mean": 13.121901512145996, "grad_norm": 12.346394538879395, "kl/avg_steps": 0.375, "kl/beta": 0.035086892545223236, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.6057459723762076e-07, "logits/chosen": -0.34115922451019287, "logits/rejected": -0.4912487864494324, "logps/chosen": -104.1282958984375, "logps/ref_chosen": -88.78244018554688, "logps/ref_rejected": -93.55414581298828, "logps/rejected": -122.02190399169922, "loss": 1.1528, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5402135252952576, "rewards/margins": 0.45309287309646606, "rewards/rejected": -0.9933063983917236, "step": 357 }, { "epoch": 0.5411942554799698, "epsilon_dpo/beta": 0.03479543328285217, "epsilon_dpo/beta_margin_grad_mean": -0.3826284110546112, "epsilon_dpo/beta_margin_grad_std": 0.17101824283599854, "epsilon_dpo/beta_margin_mean": 0.5514477491378784, "epsilon_dpo/beta_margin_std": 0.8156856298446655, "epsilon_dpo/loss_margin_mean": 15.996456146240234, "grad_norm": 12.181327819824219, "kl/avg_steps": 0.46875, "kl/beta": 0.03495581075549126, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.5925341972508954e-07, "logits/chosen": -0.5567905306816101, "logits/rejected": -0.4287104606628418, "logps/chosen": -105.34233093261719, "logps/ref_chosen": -92.37928771972656, "logps/ref_rejected": -90.57598876953125, "logps/rejected": -119.53549194335938, "loss": 1.0559, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4534444808959961, "rewards/margins": 0.5514477491378784, "rewards/rejected": -1.004892349243164, "step": 358 }, { "epoch": 0.5427059712773998, "epsilon_dpo/beta": 0.03469833359122276, "epsilon_dpo/beta_margin_grad_mean": -0.4318692982196808, "epsilon_dpo/beta_margin_grad_std": 0.18369096517562866, "epsilon_dpo/beta_margin_mean": 0.32085007429122925, "epsilon_dpo/beta_margin_std": 0.8615565896034241, "epsilon_dpo/loss_margin_mean": 9.426478385925293, "grad_norm": 12.058820724487305, "kl/avg_steps": 0.28125, "kl/beta": 0.034792717546224594, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.579319833745169e-07, "logits/chosen": -0.6475515365600586, "logits/rejected": -0.38479065895080566, "logps/chosen": -100.14118194580078, "logps/ref_chosen": -85.90824127197266, "logps/ref_rejected": -104.04725646972656, "logps/rejected": -127.70668029785156, "loss": 1.2595, "rewards/accuracies": 0.609375, "rewards/chosen": -0.49748337268829346, "rewards/margins": 0.32085007429122925, "rewards/rejected": -0.8183333873748779, "step": 359 }, { "epoch": 0.54421768707483, "epsilon_dpo/beta": 0.03460101783275604, "epsilon_dpo/beta_margin_grad_mean": -0.4063347578048706, "epsilon_dpo/beta_margin_grad_std": 0.17292876541614532, "epsilon_dpo/beta_margin_mean": 0.4609655439853668, "epsilon_dpo/beta_margin_std": 0.8432448506355286, "epsilon_dpo/loss_margin_mean": 13.486247062683105, "grad_norm": 10.786174774169922, "kl/avg_steps": 0.28125, "kl/beta": 0.03469513729214668, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.5661032514931834e-07, "logits/chosen": -0.3451042175292969, "logits/rejected": -0.5038284063339233, "logps/chosen": -98.89404296875, "logps/ref_chosen": -84.124267578125, "logps/ref_rejected": -118.12923431396484, "logps/rejected": -146.38525390625, "loss": 1.1324, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5138739943504333, "rewards/margins": 0.46096551418304443, "rewards/rejected": -0.9748395085334778, "step": 360 }, { "epoch": 0.54572940287226, "epsilon_dpo/beta": 0.03442828729748726, "epsilon_dpo/beta_margin_grad_mean": -0.3710825443267822, "epsilon_dpo/beta_margin_grad_std": 0.16406892240047455, "epsilon_dpo/beta_margin_mean": 0.6091728806495667, "epsilon_dpo/beta_margin_std": 0.8011631369590759, "epsilon_dpo/loss_margin_mean": 17.82909393310547, "grad_norm": 9.879510879516602, "kl/avg_steps": 0.5, "kl/beta": 0.03459783270955086, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.552884820191154e-07, "logits/chosen": -0.25418639183044434, "logits/rejected": -0.4245424270629883, "logps/chosen": -92.85491180419922, "logps/ref_chosen": -79.35969543457031, "logps/ref_rejected": -100.64299011230469, "logps/rejected": -131.96731567382812, "loss": 1.0057, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4663471281528473, "rewards/margins": 0.6091729402542114, "rewards/rejected": -1.0755200386047363, "step": 361 }, { "epoch": 0.54724111866969, "epsilon_dpo/beta": 0.034240804612636566, "epsilon_dpo/beta_margin_grad_mean": -0.41057583689689636, "epsilon_dpo/beta_margin_grad_std": 0.1774662286043167, "epsilon_dpo/beta_margin_mean": 0.424104779958725, "epsilon_dpo/beta_margin_std": 0.8874197006225586, "epsilon_dpo/loss_margin_mean": 12.533075332641602, "grad_norm": 13.939665794372559, "kl/avg_steps": 0.546875, "kl/beta": 0.034425701946020126, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.53966490958702e-07, "logits/chosen": -0.49522465467453003, "logits/rejected": -0.8078896999359131, "logps/chosen": -104.72351837158203, "logps/ref_chosen": -87.18423461914062, "logps/ref_rejected": -131.90994262695312, "logps/rejected": -161.9822998046875, "loss": 1.1749, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6019046306610107, "rewards/margins": 0.42410480976104736, "rewards/rejected": -1.026009440422058, "step": 362 }, { "epoch": 0.5487528344671202, "epsilon_dpo/beta": 0.03398507088422775, "epsilon_dpo/beta_margin_grad_mean": -0.3623369038105011, "epsilon_dpo/beta_margin_grad_std": 0.15423668920993805, "epsilon_dpo/beta_margin_mean": 0.6261187195777893, "epsilon_dpo/beta_margin_std": 0.7368482947349548, "epsilon_dpo/loss_margin_mean": 18.516027450561523, "grad_norm": 12.303284645080566, "kl/avg_steps": 0.75, "kl/beta": 0.034238461405038834, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.526443889470099e-07, "logits/chosen": -0.2638314664363861, "logits/rejected": -0.3185631334781647, "logps/chosen": -84.76541900634766, "logps/ref_chosen": -69.48241424560547, "logps/ref_rejected": -115.17513275146484, "logps/rejected": -148.9741668701172, "loss": 0.9752, "rewards/accuracies": 0.875, "rewards/chosen": -0.5202293395996094, "rewards/margins": 0.6261187791824341, "rewards/rejected": -1.146347999572754, "step": 363 }, { "epoch": 0.5502645502645502, "epsilon_dpo/beta": 0.0338701531291008, "epsilon_dpo/beta_margin_grad_mean": -0.37633389234542847, "epsilon_dpo/beta_margin_grad_std": 0.18905964493751526, "epsilon_dpo/beta_margin_mean": 0.6139385104179382, "epsilon_dpo/beta_margin_std": 0.9477400183677673, "epsilon_dpo/loss_margin_mean": 18.3122501373291, "grad_norm": 13.421125411987305, "kl/avg_steps": 0.34375, "kl/beta": 0.033983584493398666, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.513222129660744e-07, "logits/chosen": -0.2861930727958679, "logits/rejected": -0.30931782722473145, "logps/chosen": -93.13960266113281, "logps/ref_chosen": -81.23245239257812, "logps/ref_rejected": -109.52008056640625, "logps/rejected": -139.73948669433594, "loss": 1.0549, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4042242765426636, "rewards/margins": 0.613938570022583, "rewards/rejected": -1.018162727355957, "step": 364 }, { "epoch": 0.5517762660619804, "epsilon_dpo/beta": 0.0336800292134285, "epsilon_dpo/beta_margin_grad_mean": -0.3741872012615204, "epsilon_dpo/beta_margin_grad_std": 0.15171965956687927, "epsilon_dpo/beta_margin_mean": 0.5734090209007263, "epsilon_dpo/beta_margin_std": 0.7196288704872131, "epsilon_dpo/loss_margin_mean": 17.134653091430664, "grad_norm": 12.334189414978027, "kl/avg_steps": 0.5625, "kl/beta": 0.03386716544628143, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5e-07, "logits/chosen": -0.44367483258247375, "logits/rejected": -0.374923974275589, "logps/chosen": -95.52092742919922, "logps/ref_chosen": -88.40348815917969, "logps/ref_rejected": -93.00691986083984, "logps/rejected": -117.25901794433594, "loss": 1.0079, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24103927612304688, "rewards/margins": 0.5734090209007263, "rewards/rejected": -0.8144482970237732, "step": 365 }, { "epoch": 0.5532879818594104, "epsilon_dpo/beta": 0.033586371690034866, "epsilon_dpo/beta_margin_grad_mean": -0.40513846278190613, "epsilon_dpo/beta_margin_grad_std": 0.1917259395122528, "epsilon_dpo/beta_margin_mean": 0.47303882241249084, "epsilon_dpo/beta_margin_std": 0.9454165101051331, "epsilon_dpo/loss_margin_mean": 14.272467613220215, "grad_norm": 10.503172874450684, "kl/avg_steps": 0.28125, "kl/beta": 0.033677730709314346, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.486777870339255e-07, "logits/chosen": -0.5803329944610596, "logits/rejected": -0.5538395643234253, "logps/chosen": -87.64436340332031, "logps/ref_chosen": -77.08778381347656, "logps/ref_rejected": -88.84114074707031, "logps/rejected": -113.67018127441406, "loss": 1.1624, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3563389480113983, "rewards/margins": 0.47303879261016846, "rewards/rejected": -0.8293777704238892, "step": 366 }, { "epoch": 0.5547996976568406, "epsilon_dpo/beta": 0.03344488888978958, "epsilon_dpo/beta_margin_grad_mean": -0.41342422366142273, "epsilon_dpo/beta_margin_grad_std": 0.16751711070537567, "epsilon_dpo/beta_margin_mean": 0.3815315067768097, "epsilon_dpo/beta_margin_std": 0.7867935299873352, "epsilon_dpo/loss_margin_mean": 11.564186096191406, "grad_norm": 10.828190803527832, "kl/avg_steps": 0.421875, "kl/beta": 0.033583275973796844, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.4735561105299014e-07, "logits/chosen": -0.613606333732605, "logits/rejected": -0.5410276651382446, "logps/chosen": -92.40072631835938, "logps/ref_chosen": -77.62783813476562, "logps/ref_rejected": -104.18344116210938, "logps/rejected": -130.52052307128906, "loss": 1.1813, "rewards/accuracies": 0.734375, "rewards/chosen": -0.49686622619628906, "rewards/margins": 0.3815315365791321, "rewards/rejected": -0.8783978223800659, "step": 367 }, { "epoch": 0.5563114134542706, "epsilon_dpo/beta": 0.03333056718111038, "epsilon_dpo/beta_margin_grad_mean": -0.4020618498325348, "epsilon_dpo/beta_margin_grad_std": 0.18135952949523926, "epsilon_dpo/beta_margin_mean": 0.4608325660228729, "epsilon_dpo/beta_margin_std": 0.8643712401390076, "epsilon_dpo/loss_margin_mean": 14.009523391723633, "grad_norm": 10.81441593170166, "kl/avg_steps": 0.34375, "kl/beta": 0.033442191779613495, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.46033509041298e-07, "logits/chosen": -0.43107640743255615, "logits/rejected": -0.27693885564804077, "logps/chosen": -113.70527648925781, "logps/ref_chosen": -96.74423217773438, "logps/ref_rejected": -98.10311889648438, "logps/rejected": -129.0736846923828, "loss": 1.1441, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5687465667724609, "rewards/margins": 0.4608325660228729, "rewards/rejected": -1.0295791625976562, "step": 368 }, { "epoch": 0.5578231292517006, "epsilon_dpo/beta": 0.033211126923561096, "epsilon_dpo/beta_margin_grad_mean": -0.4330996870994568, "epsilon_dpo/beta_margin_grad_std": 0.16043256223201752, "epsilon_dpo/beta_margin_mean": 0.30416300892829895, "epsilon_dpo/beta_margin_std": 0.7585687637329102, "epsilon_dpo/loss_margin_mean": 9.304594039916992, "grad_norm": 12.7125244140625, "kl/avg_steps": 0.359375, "kl/beta": 0.033327627927064896, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.447115179808846e-07, "logits/chosen": -0.7121263742446899, "logits/rejected": -0.6011459827423096, "logps/chosen": -91.87727355957031, "logps/ref_chosen": -77.4908676147461, "logps/ref_rejected": -105.63996887207031, "logps/rejected": -129.33096313476562, "loss": 1.2349, "rewards/accuracies": 0.703125, "rewards/chosen": -0.47804659605026245, "rewards/margins": 0.30416297912597656, "rewards/rejected": -0.782209575176239, "step": 369 }, { "epoch": 0.5593348450491308, "epsilon_dpo/beta": 0.033076684921979904, "epsilon_dpo/beta_margin_grad_mean": -0.36861076951026917, "epsilon_dpo/beta_margin_grad_std": 0.1980779469013214, "epsilon_dpo/beta_margin_mean": 0.6260709166526794, "epsilon_dpo/beta_margin_std": 0.9714806079864502, "epsilon_dpo/loss_margin_mean": 19.130352020263672, "grad_norm": 15.036335945129395, "kl/avg_steps": 0.40625, "kl/beta": 0.03320828452706337, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.4338967485068164e-07, "logits/chosen": -0.37563467025756836, "logits/rejected": -0.16301874816417694, "logps/chosen": -86.26065063476562, "logps/ref_chosen": -73.92730712890625, "logps/ref_rejected": -99.21125030517578, "logps/rejected": -130.67494201660156, "loss": 1.0611, "rewards/accuracies": 0.734375, "rewards/chosen": -0.412090003490448, "rewards/margins": 0.6260708570480347, "rewards/rejected": -1.0381609201431274, "step": 370 }, { "epoch": 0.5608465608465608, "epsilon_dpo/beta": 0.03293251991271973, "epsilon_dpo/beta_margin_grad_mean": -0.3986269235610962, "epsilon_dpo/beta_margin_grad_std": 0.1933879405260086, "epsilon_dpo/beta_margin_mean": 0.48927295207977295, "epsilon_dpo/beta_margin_std": 0.940234899520874, "epsilon_dpo/loss_margin_mean": 15.045866012573242, "grad_norm": 16.158891677856445, "kl/avg_steps": 0.4375, "kl/beta": 0.033073924481868744, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.420680166254831e-07, "logits/chosen": -0.33735889196395874, "logits/rejected": -0.2781010568141937, "logps/chosen": -84.40055847167969, "logps/ref_chosen": -70.58553314208984, "logps/ref_rejected": -84.94666290283203, "logps/rejected": -113.80755615234375, "loss": 1.1498, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4571009874343872, "rewards/margins": 0.48927295207977295, "rewards/rejected": -0.9463739395141602, "step": 371 }, { "epoch": 0.562358276643991, "epsilon_dpo/beta": 0.03283023461699486, "epsilon_dpo/beta_margin_grad_mean": -0.4104512333869934, "epsilon_dpo/beta_margin_grad_std": 0.19789732992649078, "epsilon_dpo/beta_margin_mean": 0.42812153697013855, "epsilon_dpo/beta_margin_std": 0.9525101184844971, "epsilon_dpo/loss_margin_mean": 13.248870849609375, "grad_norm": 11.11093807220459, "kl/avg_steps": 0.3125, "kl/beta": 0.032929856330156326, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.4074658027491044e-07, "logits/chosen": -0.41160130500793457, "logits/rejected": -0.47164422273635864, "logps/chosen": -86.42536163330078, "logps/ref_chosen": -74.52593994140625, "logps/ref_rejected": -100.20262145996094, "logps/rejected": -125.35091400146484, "loss": 1.2052, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39410853385925293, "rewards/margins": 0.42812150716781616, "rewards/rejected": -0.8222300410270691, "step": 372 }, { "epoch": 0.563869992441421, "epsilon_dpo/beta": 0.03270743787288666, "epsilon_dpo/beta_margin_grad_mean": -0.4025079607963562, "epsilon_dpo/beta_margin_grad_std": 0.18594452738761902, "epsilon_dpo/beta_margin_mean": 0.4715001881122589, "epsilon_dpo/beta_margin_std": 0.8927410244941711, "epsilon_dpo/loss_margin_mean": 14.600152969360352, "grad_norm": 12.162555694580078, "kl/avg_steps": 0.375, "kl/beta": 0.03282726928591728, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.394254027623792e-07, "logits/chosen": -0.4808266758918762, "logits/rejected": -0.5573515892028809, "logps/chosen": -105.85910034179688, "logps/ref_chosen": -87.2992935180664, "logps/ref_rejected": -109.5474853515625, "logps/rejected": -142.7074432373047, "loss": 1.1456, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6099110245704651, "rewards/margins": 0.4715002179145813, "rewards/rejected": -1.0814111232757568, "step": 373 }, { "epoch": 0.5653817082388511, "epsilon_dpo/beta": 0.03252391517162323, "epsilon_dpo/beta_margin_grad_mean": -0.3559419512748718, "epsilon_dpo/beta_margin_grad_std": 0.18340633809566498, "epsilon_dpo/beta_margin_mean": 0.6854166984558105, "epsilon_dpo/beta_margin_std": 0.9182686805725098, "epsilon_dpo/loss_margin_mean": 21.238224029541016, "grad_norm": 10.829121589660645, "kl/avg_steps": 0.5625, "kl/beta": 0.03270462900400162, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.33610478043556213, "logits/rejected": -0.38159722089767456, "logps/chosen": -118.37493896484375, "logps/ref_chosen": -103.18836212158203, "logps/ref_rejected": -101.68389892578125, "logps/rejected": -138.10870361328125, "loss": 0.9944, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49699217081069946, "rewards/margins": 0.6854166984558105, "rewards/rejected": -1.1824089288711548, "step": 374 }, { "epoch": 0.5668934240362812, "epsilon_dpo/beta": 0.03239784389734268, "epsilon_dpo/beta_margin_grad_mean": -0.39941704273223877, "epsilon_dpo/beta_margin_grad_std": 0.1892874836921692, "epsilon_dpo/beta_margin_mean": 0.4690830409526825, "epsilon_dpo/beta_margin_std": 0.890160858631134, "epsilon_dpo/loss_margin_mean": 14.67497730255127, "grad_norm": 12.20340633392334, "kl/avg_steps": 0.390625, "kl/beta": 0.03252169117331505, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.3678397206786715e-07, "logits/chosen": -0.3479893207550049, "logits/rejected": -0.14140911400318146, "logps/chosen": -90.7242660522461, "logps/ref_chosen": -80.96290588378906, "logps/ref_rejected": -105.99746704101562, "logps/rejected": -130.43380737304688, "loss": 1.1496, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31886184215545654, "rewards/margins": 0.46908310055732727, "rewards/rejected": -0.7879449129104614, "step": 375 }, { "epoch": 0.5684051398337112, "epsilon_dpo/beta": 0.0322667732834816, "epsilon_dpo/beta_margin_grad_mean": -0.3862917721271515, "epsilon_dpo/beta_margin_grad_std": 0.19730228185653687, "epsilon_dpo/beta_margin_mean": 0.5730307698249817, "epsilon_dpo/beta_margin_std": 0.984718918800354, "epsilon_dpo/loss_margin_mean": 17.959718704223633, "grad_norm": 10.152912139892578, "kl/avg_steps": 0.40625, "kl/beta": 0.032395150512456894, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.3546379277238103e-07, "logits/chosen": -0.24255940318107605, "logits/rejected": -0.46518683433532715, "logps/chosen": -86.96746826171875, "logps/ref_chosen": -73.2122573852539, "logps/ref_rejected": -96.98638916015625, "logps/rejected": -128.70132446289062, "loss": 1.1006, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44633597135543823, "rewards/margins": 0.5730308294296265, "rewards/rejected": -1.01936674118042, "step": 376 }, { "epoch": 0.5699168556311414, "epsilon_dpo/beta": 0.032126132398843765, "epsilon_dpo/beta_margin_grad_mean": -0.4142596423625946, "epsilon_dpo/beta_margin_grad_std": 0.16649965941905975, "epsilon_dpo/beta_margin_mean": 0.4061944782733917, "epsilon_dpo/beta_margin_std": 0.7875421047210693, "epsilon_dpo/loss_margin_mean": 12.792858123779297, "grad_norm": 11.573308944702148, "kl/avg_steps": 0.4375, "kl/beta": 0.03226407617330551, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.3414402008585886e-07, "logits/chosen": -0.2805550694465637, "logits/rejected": -0.31607991456985474, "logps/chosen": -85.04446411132812, "logps/ref_chosen": -68.43621063232422, "logps/ref_rejected": -80.13630676269531, "logps/rejected": -109.53741455078125, "loss": 1.1594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5347778797149658, "rewards/margins": 0.4061944782733917, "rewards/rejected": -0.9409723281860352, "step": 377 }, { "epoch": 0.5714285714285714, "epsilon_dpo/beta": 0.03196611627936363, "epsilon_dpo/beta_margin_grad_mean": -0.41235488653182983, "epsilon_dpo/beta_margin_grad_std": 0.17046606540679932, "epsilon_dpo/beta_margin_mean": 0.4097154140472412, "epsilon_dpo/beta_margin_std": 0.8452909588813782, "epsilon_dpo/loss_margin_mean": 12.954933166503906, "grad_norm": 11.13086986541748, "kl/avg_steps": 0.5, "kl/beta": 0.03212353587150574, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.3282469092517977e-07, "logits/chosen": -0.3559729754924774, "logits/rejected": -0.3680075407028198, "logps/chosen": -99.54299926757812, "logps/ref_chosen": -83.83049011230469, "logps/ref_rejected": -101.34911346435547, "logps/rejected": -130.0165557861328, "loss": 1.1737, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5037399530410767, "rewards/margins": 0.4097154140472412, "rewards/rejected": -0.9134553074836731, "step": 378 }, { "epoch": 0.5729402872260015, "epsilon_dpo/beta": 0.031827058643102646, "epsilon_dpo/beta_margin_grad_mean": -0.3888133764266968, "epsilon_dpo/beta_margin_grad_std": 0.18203434348106384, "epsilon_dpo/beta_margin_mean": 0.5286673307418823, "epsilon_dpo/beta_margin_std": 0.8799561262130737, "epsilon_dpo/loss_margin_mean": 16.793392181396484, "grad_norm": 11.439459800720215, "kl/avg_steps": 0.4375, "kl/beta": 0.03196371719241142, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.3150584219481643e-07, "logits/chosen": -0.6511602997779846, "logits/rejected": -0.6409453749656677, "logps/chosen": -112.08035278320312, "logps/ref_chosen": -99.05479431152344, "logps/ref_rejected": -126.27124786376953, "logps/rejected": -156.09019470214844, "loss": 1.0957, "rewards/accuracies": 0.75, "rewards/chosen": -0.4176122844219208, "rewards/margins": 0.5286673903465271, "rewards/rejected": -0.9462796449661255, "step": 379 }, { "epoch": 0.5744520030234316, "epsilon_dpo/beta": 0.03164863586425781, "epsilon_dpo/beta_margin_grad_mean": -0.36535948514938354, "epsilon_dpo/beta_margin_grad_std": 0.18868181109428406, "epsilon_dpo/beta_margin_mean": 0.6302640438079834, "epsilon_dpo/beta_margin_std": 0.9209726452827454, "epsilon_dpo/loss_margin_mean": 20.09124183654785, "grad_norm": 9.986943244934082, "kl/avg_steps": 0.5625, "kl/beta": 0.03182448446750641, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.3018751078580283e-07, "logits/chosen": -0.31451958417892456, "logits/rejected": -0.2750563621520996, "logps/chosen": -93.44203186035156, "logps/ref_chosen": -83.15869140625, "logps/ref_rejected": -100.08312225341797, "logps/rejected": -130.45770263671875, "loss": 1.0381, "rewards/accuracies": 0.796875, "rewards/chosen": -0.32757604122161865, "rewards/margins": 0.6302640438079834, "rewards/rejected": -0.957840085029602, "step": 380 }, { "epoch": 0.5759637188208617, "epsilon_dpo/beta": 0.031570516526699066, "epsilon_dpo/beta_margin_grad_mean": -0.4260694086551666, "epsilon_dpo/beta_margin_grad_std": 0.17625002562999725, "epsilon_dpo/beta_margin_mean": 0.3538646697998047, "epsilon_dpo/beta_margin_std": 0.8401235342025757, "epsilon_dpo/loss_margin_mean": 11.38886833190918, "grad_norm": 10.793598175048828, "kl/avg_steps": 0.25, "kl/beta": 0.031646475195884705, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.288697335747027e-07, "logits/chosen": -0.29458528757095337, "logits/rejected": -0.4564410448074341, "logps/chosen": -97.15643310546875, "logps/ref_chosen": -81.04007720947266, "logps/ref_rejected": -89.97328186035156, "logps/rejected": -117.47850036621094, "loss": 1.2213, "rewards/accuracies": 0.625, "rewards/chosen": -0.5116348266601562, "rewards/margins": 0.3538646996021271, "rewards/rejected": -0.8654994964599609, "step": 381 }, { "epoch": 0.5774754346182918, "epsilon_dpo/beta": 0.03141285851597786, "epsilon_dpo/beta_margin_grad_mean": -0.40493854880332947, "epsilon_dpo/beta_margin_grad_std": 0.1608555167913437, "epsilon_dpo/beta_margin_mean": 0.4543093740940094, "epsilon_dpo/beta_margin_std": 0.7822091579437256, "epsilon_dpo/loss_margin_mean": 14.60008430480957, "grad_norm": 13.035445213317871, "kl/avg_steps": 0.5, "kl/beta": 0.03156755492091179, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.2755254742257706e-07, "logits/chosen": -0.2647457718849182, "logits/rejected": -0.5761418342590332, "logps/chosen": -99.72867584228516, "logps/ref_chosen": -83.58617401123047, "logps/ref_rejected": -105.45875549316406, "logps/rejected": -136.2013397216797, "loss": 1.1163, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5082204341888428, "rewards/margins": 0.4543094038963318, "rewards/rejected": -0.9625298380851746, "step": 382 }, { "epoch": 0.5789871504157218, "epsilon_dpo/beta": 0.031354743987321854, "epsilon_dpo/beta_margin_grad_mean": -0.4083390533924103, "epsilon_dpo/beta_margin_grad_std": 0.18964853882789612, "epsilon_dpo/beta_margin_mean": 0.45374611020088196, "epsilon_dpo/beta_margin_std": 0.9225190281867981, "epsilon_dpo/loss_margin_mean": 14.688213348388672, "grad_norm": 12.959400177001953, "kl/avg_steps": 0.1875, "kl/beta": 0.03141050413250923, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 2.2623598917395436e-07, "logits/chosen": -0.4968550503253937, "logits/rejected": -0.37411147356033325, "logps/chosen": -116.3802719116211, "logps/ref_chosen": -102.12741088867188, "logps/ref_rejected": -100.01849365234375, "logps/rejected": -128.95956420898438, "loss": 1.1697, "rewards/accuracies": 0.640625, "rewards/chosen": -0.45026350021362305, "rewards/margins": 0.45374611020088196, "rewards/rejected": -0.9040095806121826, "step": 383 }, { "epoch": 0.5804988662131519, "epsilon_dpo/beta": 0.03119313158094883, "epsilon_dpo/beta_margin_grad_mean": -0.3977428674697876, "epsilon_dpo/beta_margin_grad_std": 0.17535927891731262, "epsilon_dpo/beta_margin_mean": 0.48126453161239624, "epsilon_dpo/beta_margin_std": 0.8371120691299438, "epsilon_dpo/loss_margin_mean": 15.583664894104004, "grad_norm": 11.489479064941406, "kl/avg_steps": 0.515625, "kl/beta": 0.0313517190515995, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.2492009565579875e-07, "logits/chosen": -0.0987042710185051, "logits/rejected": -0.31251096725463867, "logps/chosen": -102.03778076171875, "logps/ref_chosen": -86.96634674072266, "logps/ref_rejected": -101.53166198730469, "logps/rejected": -132.18675231933594, "loss": 1.1173, "rewards/accuracies": 0.734375, "rewards/chosen": -0.47207605838775635, "rewards/margins": 0.48126456141471863, "rewards/rejected": -0.9533406496047974, "step": 384 }, { "epoch": 0.582010582010582, "epsilon_dpo/beta": 0.031077032908797264, "epsilon_dpo/beta_margin_grad_mean": -0.3920232951641083, "epsilon_dpo/beta_margin_grad_std": 0.1606670767068863, "epsilon_dpo/beta_margin_mean": 0.5069488286972046, "epsilon_dpo/beta_margin_std": 0.7647234797477722, "epsilon_dpo/loss_margin_mean": 16.471561431884766, "grad_norm": 14.488018989562988, "kl/avg_steps": 0.375, "kl/beta": 0.031190890818834305, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.2360490367648084e-07, "logits/chosen": -0.45997345447540283, "logits/rejected": -0.5446884632110596, "logps/chosen": -106.83857727050781, "logps/ref_chosen": -91.81571960449219, "logps/ref_rejected": -110.62493896484375, "logps/rejected": -142.11935424804688, "loss": 1.0717, "rewards/accuracies": 0.734375, "rewards/chosen": -0.46806877851486206, "rewards/margins": 0.5069488286972046, "rewards/rejected": -0.9750176668167114, "step": 385 }, { "epoch": 0.5835222978080121, "epsilon_dpo/beta": 0.03096092864871025, "epsilon_dpo/beta_margin_grad_mean": -0.42493200302124023, "epsilon_dpo/beta_margin_grad_std": 0.17903663218021393, "epsilon_dpo/beta_margin_mean": 0.34691476821899414, "epsilon_dpo/beta_margin_std": 0.8625679016113281, "epsilon_dpo/loss_margin_mean": 11.385979652404785, "grad_norm": 12.905364990234375, "kl/avg_steps": 0.375, "kl/beta": 0.031074361875653267, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.2229045002474724e-07, "logits/chosen": -0.26814377307891846, "logits/rejected": -0.4823966324329376, "logps/chosen": -116.19183349609375, "logps/ref_chosen": -99.04656982421875, "logps/ref_rejected": -122.45169067382812, "logps/rejected": -150.98294067382812, "loss": 1.2355, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5320286750793457, "rewards/margins": 0.34691473841667175, "rewards/rejected": -0.8789434432983398, "step": 386 }, { "epoch": 0.5850340136054422, "epsilon_dpo/beta": 0.030787205323576927, "epsilon_dpo/beta_margin_grad_mean": -0.3811575174331665, "epsilon_dpo/beta_margin_grad_std": 0.17074567079544067, "epsilon_dpo/beta_margin_mean": 0.555602490901947, "epsilon_dpo/beta_margin_std": 0.809950590133667, "epsilon_dpo/loss_margin_mean": 18.20049285888672, "grad_norm": 10.500995635986328, "kl/avg_steps": 0.5625, "kl/beta": 0.030958266928792, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.209767714686924e-07, "logits/chosen": -0.4561399519443512, "logits/rejected": -0.2552209794521332, "logps/chosen": -83.26592254638672, "logps/ref_chosen": -71.10548400878906, "logps/ref_rejected": -108.16309356689453, "logps/rejected": -138.52401733398438, "loss": 1.0512, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3770449757575989, "rewards/margins": 0.555602490901947, "rewards/rejected": -0.9326474666595459, "step": 387 }, { "epoch": 0.5865457294028723, "epsilon_dpo/beta": 0.030687108635902405, "epsilon_dpo/beta_margin_grad_mean": -0.4308612644672394, "epsilon_dpo/beta_margin_grad_std": 0.1636020541191101, "epsilon_dpo/beta_margin_mean": 0.31852632761001587, "epsilon_dpo/beta_margin_std": 0.7362657785415649, "epsilon_dpo/loss_margin_mean": 10.55184555053711, "grad_norm": 14.64468765258789, "kl/avg_steps": 0.328125, "kl/beta": 0.030785102397203445, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.1966390475472954e-07, "logits/chosen": -0.3645251393318176, "logits/rejected": -0.34023189544677734, "logps/chosen": -116.99144744873047, "logps/ref_chosen": -101.09429931640625, "logps/ref_rejected": -107.06742858886719, "logps/rejected": -133.51641845703125, "loss": 1.2187, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4895590543746948, "rewards/margins": 0.31852632761001587, "rewards/rejected": -0.8080854415893555, "step": 388 }, { "epoch": 0.5880574452003023, "epsilon_dpo/beta": 0.03055322915315628, "epsilon_dpo/beta_margin_grad_mean": -0.3908854126930237, "epsilon_dpo/beta_margin_grad_std": 0.1603168547153473, "epsilon_dpo/beta_margin_mean": 0.5040192604064941, "epsilon_dpo/beta_margin_std": 0.7411776781082153, "epsilon_dpo/loss_margin_mean": 16.651473999023438, "grad_norm": 10.520373344421387, "kl/avg_steps": 0.4375, "kl/beta": 0.030684418976306915, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.1835188660656265e-07, "logits/chosen": -0.2951532006263733, "logits/rejected": -0.3086988031864166, "logps/chosen": -92.45687103271484, "logps/ref_chosen": -81.87734985351562, "logps/ref_rejected": -99.03115844726562, "logps/rejected": -126.26214599609375, "loss": 1.0683, "rewards/accuracies": 0.75, "rewards/chosen": -0.32412028312683105, "rewards/margins": 0.5040192604064941, "rewards/rejected": -0.8281395435333252, "step": 389 }, { "epoch": 0.5895691609977324, "epsilon_dpo/beta": 0.03042014129459858, "epsilon_dpo/beta_margin_grad_mean": -0.3851366937160492, "epsilon_dpo/beta_margin_grad_std": 0.1705460548400879, "epsilon_dpo/beta_margin_mean": 0.5503158569335938, "epsilon_dpo/beta_margin_std": 0.8301871418952942, "epsilon_dpo/loss_margin_mean": 18.252700805664062, "grad_norm": 9.353094100952148, "kl/avg_steps": 0.4375, "kl/beta": 0.0305507592856884, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.170407537241599e-07, "logits/chosen": -0.27739864587783813, "logits/rejected": -0.28185367584228516, "logps/chosen": -70.11703491210938, "logps/ref_chosen": -64.2594985961914, "logps/ref_rejected": -85.45353698730469, "logps/rejected": -109.56376647949219, "loss": 1.0595, "rewards/accuracies": 0.734375, "rewards/chosen": -0.17991623282432556, "rewards/margins": 0.5503158569335938, "rewards/rejected": -0.7302320599555969, "step": 390 }, { "epoch": 0.5910808767951625, "epsilon_dpo/beta": 0.03024960495531559, "epsilon_dpo/beta_margin_grad_mean": -0.36598044633865356, "epsilon_dpo/beta_margin_grad_std": 0.18533697724342346, "epsilon_dpo/beta_margin_mean": 0.6447353363037109, "epsilon_dpo/beta_margin_std": 0.9046013951301575, "epsilon_dpo/loss_margin_mean": 21.484485626220703, "grad_norm": 9.929374694824219, "kl/avg_steps": 0.5625, "kl/beta": 0.030417680740356445, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.1573054278272636e-07, "logits/chosen": -0.5862469673156738, "logits/rejected": -0.41867655515670776, "logps/chosen": -95.09674072265625, "logps/ref_chosen": -84.55359649658203, "logps/ref_rejected": -111.49552154541016, "logps/rejected": -143.52316284179688, "loss": 1.0192, "rewards/accuracies": 0.75, "rewards/chosen": -0.3208223581314087, "rewards/margins": 0.6447353363037109, "rewards/rejected": -0.9655576944351196, "step": 391 }, { "epoch": 0.5925925925925926, "epsilon_dpo/beta": 0.030118217691779137, "epsilon_dpo/beta_margin_grad_mean": -0.3631499409675598, "epsilon_dpo/beta_margin_grad_std": 0.1734553724527359, "epsilon_dpo/beta_margin_mean": 0.6588443517684937, "epsilon_dpo/beta_margin_std": 0.8642693161964417, "epsilon_dpo/loss_margin_mean": 22.05181121826172, "grad_norm": 9.381688117980957, "kl/avg_steps": 0.4375, "kl/beta": 0.030247539281845093, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -0.8898314833641052, "logits/rejected": -0.3909822702407837, "logps/chosen": -101.3697509765625, "logps/ref_chosen": -95.38569641113281, "logps/ref_rejected": -124.9638671875, "logps/rejected": -152.99972534179688, "loss": 0.9915, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18112841248512268, "rewards/margins": 0.6588443517684937, "rewards/rejected": -0.8399727940559387, "step": 392 }, { "epoch": 0.5941043083900227, "epsilon_dpo/beta": 0.029968200251460075, "epsilon_dpo/beta_margin_grad_mean": -0.3912041187286377, "epsilon_dpo/beta_margin_grad_std": 0.15663762390613556, "epsilon_dpo/beta_margin_mean": 0.5000752806663513, "epsilon_dpo/beta_margin_std": 0.7438558340072632, "epsilon_dpo/loss_margin_mean": 16.825716018676758, "grad_norm": 10.260003089904785, "kl/avg_steps": 0.5, "kl/beta": 0.030115783214569092, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.131130332936195e-07, "logits/chosen": -0.31805336475372314, "logits/rejected": -0.3601935803890228, "logps/chosen": -87.36988067626953, "logps/ref_chosen": -76.34320068359375, "logps/ref_rejected": -94.04782104492188, "logps/rejected": -121.90022277832031, "loss": 1.0705, "rewards/accuracies": 0.75, "rewards/chosen": -0.33345675468444824, "rewards/margins": 0.5000752806663513, "rewards/rejected": -0.8335320949554443, "step": 393 }, { "epoch": 0.5956160241874527, "epsilon_dpo/beta": 0.029828468337655067, "epsilon_dpo/beta_margin_grad_mean": -0.3802933990955353, "epsilon_dpo/beta_margin_grad_std": 0.1510239988565445, "epsilon_dpo/beta_margin_mean": 0.5410258173942566, "epsilon_dpo/beta_margin_std": 0.7005043029785156, "epsilon_dpo/loss_margin_mean": 18.2916202545166, "grad_norm": 11.447973251342773, "kl/avg_steps": 0.46875, "kl/beta": 0.029965952038764954, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.1180580796331323e-07, "logits/chosen": -0.21706591546535492, "logits/rejected": -0.3572559356689453, "logps/chosen": -91.35517883300781, "logps/ref_chosen": -82.9498291015625, "logps/ref_rejected": -92.46846771240234, "logps/rejected": -119.16543579101562, "loss": 1.0272, "rewards/accuracies": 0.75, "rewards/chosen": -0.25249218940734863, "rewards/margins": 0.5410258173942566, "rewards/rejected": -0.79351806640625, "step": 394 }, { "epoch": 0.5971277399848829, "epsilon_dpo/beta": 0.029754459857940674, "epsilon_dpo/beta_margin_grad_mean": -0.41406112909317017, "epsilon_dpo/beta_margin_grad_std": 0.16929131746292114, "epsilon_dpo/beta_margin_mean": 0.40022772550582886, "epsilon_dpo/beta_margin_std": 0.7990128397941589, "epsilon_dpo/loss_margin_mean": 13.644739151000977, "grad_norm": 10.52011489868164, "kl/avg_steps": 0.25, "kl/beta": 0.029826141893863678, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 2.104996510066625e-07, "logits/chosen": -0.3717024028301239, "logits/rejected": -0.5349926352500916, "logps/chosen": -79.53875732421875, "logps/ref_chosen": -70.12308502197266, "logps/ref_rejected": -102.75277709960938, "logps/rejected": -125.81317138671875, "loss": 1.1687, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2816503345966339, "rewards/margins": 0.40022772550582886, "rewards/rejected": -0.6818780899047852, "step": 395 }, { "epoch": 0.5986394557823129, "epsilon_dpo/beta": 0.02956877090036869, "epsilon_dpo/beta_margin_grad_mean": -0.3932345509529114, "epsilon_dpo/beta_margin_grad_std": 0.14215026795864105, "epsilon_dpo/beta_margin_mean": 0.47859686613082886, "epsilon_dpo/beta_margin_std": 0.6916207671165466, "epsilon_dpo/loss_margin_mean": 16.287395477294922, "grad_norm": 12.903496742248535, "kl/avg_steps": 0.625, "kl/beta": 0.029751762747764587, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.0919459895968517e-07, "logits/chosen": -0.2872779965400696, "logits/rejected": -0.3059869408607483, "logps/chosen": -81.68611907958984, "logps/ref_chosen": -71.49127197265625, "logps/ref_rejected": -107.02473449707031, "logps/rejected": -133.50697326660156, "loss": 1.0691, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30214953422546387, "rewards/margins": 0.47859686613082886, "rewards/rejected": -0.780746340751648, "step": 396 }, { "epoch": 0.600151171579743, "epsilon_dpo/beta": 0.029431317001581192, "epsilon_dpo/beta_margin_grad_mean": -0.4295797646045685, "epsilon_dpo/beta_margin_grad_std": 0.17762644588947296, "epsilon_dpo/beta_margin_mean": 0.308048814535141, "epsilon_dpo/beta_margin_std": 0.8488595485687256, "epsilon_dpo/loss_margin_mean": 10.641646385192871, "grad_norm": 11.531167984008789, "kl/avg_steps": 0.46875, "kl/beta": 0.029566969722509384, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.078906883274924e-07, "logits/chosen": -0.5348297357559204, "logits/rejected": -0.1661069393157959, "logps/chosen": -107.25617980957031, "logps/ref_chosen": -95.6727523803711, "logps/ref_rejected": -111.85726928710938, "logps/rejected": -134.08233642578125, "loss": 1.2647, "rewards/accuracies": 0.703125, "rewards/chosen": -0.34256458282470703, "rewards/margins": 0.3080487847328186, "rewards/rejected": -0.6506133675575256, "step": 397 }, { "epoch": 0.6016628873771731, "epsilon_dpo/beta": 0.02931239642202854, "epsilon_dpo/beta_margin_grad_mean": -0.3774406909942627, "epsilon_dpo/beta_margin_grad_std": 0.17114564776420593, "epsilon_dpo/beta_margin_mean": 0.585285484790802, "epsilon_dpo/beta_margin_std": 0.8242340683937073, "epsilon_dpo/loss_margin_mean": 20.152509689331055, "grad_norm": 8.442160606384277, "kl/avg_steps": 0.40625, "kl/beta": 0.02942902036011219, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.40959131717681885, "logits/rejected": -0.6267037391662598, "logps/chosen": -79.64991760253906, "logps/ref_chosen": -70.61448669433594, "logps/ref_rejected": -98.2459716796875, "logps/rejected": -127.43391418457031, "loss": 1.0326, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26629623770713806, "rewards/margins": 0.5852854251861572, "rewards/rejected": -0.8515816926956177, "step": 398 }, { "epoch": 0.6031746031746031, "epsilon_dpo/beta": 0.029175475239753723, "epsilon_dpo/beta_margin_grad_mean": -0.38290148973464966, "epsilon_dpo/beta_margin_grad_std": 0.1641579419374466, "epsilon_dpo/beta_margin_mean": 0.5577808618545532, "epsilon_dpo/beta_margin_std": 0.8090330958366394, "epsilon_dpo/loss_margin_mean": 19.284090042114258, "grad_norm": 15.290122985839844, "kl/avg_steps": 0.46875, "kl/beta": 0.029309948906302452, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.052864371672457e-07, "logits/chosen": -0.12391947209835052, "logits/rejected": -0.24706780910491943, "logps/chosen": -112.02005004882812, "logps/ref_chosen": -93.99934387207031, "logps/ref_rejected": -146.26950073242188, "logps/rejected": -183.5742950439453, "loss": 1.0452, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5295411348342896, "rewards/margins": 0.5577808618545532, "rewards/rejected": -1.0873219966888428, "step": 399 }, { "epoch": 0.6046863189720333, "epsilon_dpo/beta": 0.029103174805641174, "epsilon_dpo/beta_margin_grad_mean": -0.434611976146698, "epsilon_dpo/beta_margin_grad_std": 0.17042642831802368, "epsilon_dpo/beta_margin_mean": 0.3079947233200073, "epsilon_dpo/beta_margin_std": 0.7888738512992859, "epsilon_dpo/loss_margin_mean": 10.77357292175293, "grad_norm": 12.158735275268555, "kl/avg_steps": 0.25, "kl/beta": 0.029173199087381363, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.0398616948569493e-07, "logits/chosen": -0.2648681700229645, "logits/rejected": -0.5493128299713135, "logps/chosen": -118.06231689453125, "logps/ref_chosen": -97.5876235961914, "logps/ref_rejected": -117.63888549804688, "logps/rejected": -148.88714599609375, "loss": 1.2438, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5979068279266357, "rewards/margins": 0.3079947233200073, "rewards/rejected": -0.9059015512466431, "step": 400 }, { "epoch": 0.6046863189720333, "eval_epsilon_dpo/beta": 0.028982818126678467, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4003281891345978, "eval_epsilon_dpo/beta_margin_grad_std": 0.17058531939983368, "eval_epsilon_dpo/beta_margin_mean": 0.47119957208633423, "eval_epsilon_dpo/beta_margin_std": 0.8252949118614197, "eval_epsilon_dpo/loss_margin_mean": 16.441312789916992, "eval_kl/n_epsilon_steps": 0.2922535240650177, "eval_kl/p_epsilon_steps": 0.7064260840415955, "eval_logits/chosen": -0.35346463322639465, "eval_logits/rejected": -0.45723411440849304, "eval_logps/chosen": -101.37396240234375, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -134.62362670898438, "eval_loss": 0.5624240636825562, "eval_rewards/accuracies": 0.7178696990013123, "eval_rewards/chosen": -0.4064703583717346, "eval_rewards/margins": 0.47119957208633423, "eval_rewards/rejected": -0.8776699304580688, "eval_runtime": 47.6139, "eval_samples_per_second": 48.368, "eval_steps_per_second": 1.512, "step": 400 }, { "epoch": 0.6061980347694633, "epsilon_dpo/beta": 0.02892146445810795, "epsilon_dpo/beta_margin_grad_mean": -0.3574426770210266, "epsilon_dpo/beta_margin_grad_std": 0.16776859760284424, "epsilon_dpo/beta_margin_mean": 0.6677769422531128, "epsilon_dpo/beta_margin_std": 0.8188538551330566, "epsilon_dpo/loss_margin_mean": 23.23797035217285, "grad_norm": 8.932498931884766, "kl/avg_steps": 0.625, "kl/beta": 0.02910044975578785, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.0268718890989752e-07, "logits/chosen": -0.14188948273658752, "logits/rejected": -0.18290500342845917, "logps/chosen": -81.09709167480469, "logps/ref_chosen": -72.36085510253906, "logps/ref_rejected": -105.40269470214844, "logps/rejected": -137.37689208984375, "loss": 0.9714, "rewards/accuracies": 0.828125, "rewards/chosen": -0.25411131978034973, "rewards/margins": 0.6677769422531128, "rewards/rejected": -0.9218882918357849, "step": 401 }, { "epoch": 0.6077097505668935, "epsilon_dpo/beta": 0.02879149280488491, "epsilon_dpo/beta_margin_grad_mean": -0.3803299069404602, "epsilon_dpo/beta_margin_grad_std": 0.1708374321460724, "epsilon_dpo/beta_margin_mean": 0.5698226690292358, "epsilon_dpo/beta_margin_std": 0.8285603523254395, "epsilon_dpo/loss_margin_mean": 19.973407745361328, "grad_norm": 18.650056838989258, "kl/avg_steps": 0.453125, "kl/beta": 0.028919700533151627, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.013895317751323e-07, "logits/chosen": -0.46771174669265747, "logits/rejected": -0.2445870041847229, "logps/chosen": -94.94572448730469, "logps/ref_chosen": -83.60546112060547, "logps/ref_rejected": -94.18315124511719, "logps/rejected": -125.496826171875, "loss": 1.0448, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3300950527191162, "rewards/margins": 0.5698226690292358, "rewards/rejected": -0.899917721748352, "step": 402 }, { "epoch": 0.6092214663643235, "epsilon_dpo/beta": 0.02863917127251625, "epsilon_dpo/beta_margin_grad_mean": -0.376331090927124, "epsilon_dpo/beta_margin_grad_std": 0.18496714532375336, "epsilon_dpo/beta_margin_mean": 0.6017536520957947, "epsilon_dpo/beta_margin_std": 0.9268775582313538, "epsilon_dpo/loss_margin_mean": 21.197481155395508, "grad_norm": 13.501971244812012, "kl/avg_steps": 0.53125, "kl/beta": 0.028789250180125237, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0009323437965898e-07, "logits/chosen": -0.38072872161865234, "logits/rejected": -0.4114571809768677, "logps/chosen": -87.80209350585938, "logps/ref_chosen": -72.8893814086914, "logps/ref_rejected": -107.12028503417969, "logps/rejected": -143.23048400878906, "loss": 1.0558, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4293259084224701, "rewards/margins": 0.6017535924911499, "rewards/rejected": -1.0310795307159424, "step": 403 }, { "epoch": 0.6107331821617535, "epsilon_dpo/beta": 0.028505729511380196, "epsilon_dpo/beta_margin_grad_mean": -0.3736587464809418, "epsilon_dpo/beta_margin_grad_std": 0.17950692772865295, "epsilon_dpo/beta_margin_mean": 0.6139946579933167, "epsilon_dpo/beta_margin_std": 0.8749144077301025, "epsilon_dpo/loss_margin_mean": 21.728633880615234, "grad_norm": 9.500164985656738, "kl/avg_steps": 0.46875, "kl/beta": 0.0286371149122715, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.9879833298370237e-07, "logits/chosen": -0.43219926953315735, "logits/rejected": -0.43257758021354675, "logps/chosen": -99.16268920898438, "logps/ref_chosen": -86.45829772949219, "logps/ref_rejected": -120.54048156738281, "logps/rejected": -154.9735107421875, "loss": 1.0292, "rewards/accuracies": 0.75, "rewards/chosen": -0.3642900586128235, "rewards/margins": 0.6139946579933167, "rewards/rejected": -0.9782847166061401, "step": 404 }, { "epoch": 0.6122448979591837, "epsilon_dpo/beta": 0.028408365324139595, "epsilon_dpo/beta_margin_grad_mean": -0.41101425886154175, "epsilon_dpo/beta_margin_grad_std": 0.16673527657985687, "epsilon_dpo/beta_margin_mean": 0.42524203658103943, "epsilon_dpo/beta_margin_std": 0.8040966987609863, "epsilon_dpo/loss_margin_mean": 15.15623664855957, "grad_norm": 9.812227249145508, "kl/avg_steps": 0.34375, "kl/beta": 0.028503505513072014, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.975048638084379e-07, "logits/chosen": -0.25932326912879944, "logits/rejected": -0.3149704337120056, "logps/chosen": -89.40546417236328, "logps/ref_chosen": -73.7075424194336, "logps/ref_rejected": -89.86683654785156, "logps/rejected": -120.72099304199219, "loss": 1.148, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4482584297657013, "rewards/margins": 0.42524200677871704, "rewards/rejected": -0.8735004663467407, "step": 405 }, { "epoch": 0.6137566137566137, "epsilon_dpo/beta": 0.0282400231808424, "epsilon_dpo/beta_margin_grad_mean": -0.3645992875099182, "epsilon_dpo/beta_margin_grad_std": 0.18260234594345093, "epsilon_dpo/beta_margin_mean": 0.6403554677963257, "epsilon_dpo/beta_margin_std": 0.8946476578712463, "epsilon_dpo/loss_margin_mean": 22.858041763305664, "grad_norm": 9.639616012573242, "kl/avg_steps": 0.59375, "kl/beta": 0.02840586006641388, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.9621286303497914e-07, "logits/chosen": -0.5095548629760742, "logits/rejected": -0.504365086555481, "logps/chosen": -81.74095153808594, "logps/ref_chosen": -69.46755981445312, "logps/ref_rejected": -119.13258361816406, "logps/rejected": -154.26402282714844, "loss": 1.0191, "rewards/accuracies": 0.796875, "rewards/chosen": -0.34815317392349243, "rewards/margins": 0.6403554677963257, "rewards/rejected": -0.9885086417198181, "step": 406 }, { "epoch": 0.6152683295540439, "epsilon_dpo/beta": 0.028135115280747414, "epsilon_dpo/beta_margin_grad_mean": -0.4090394377708435, "epsilon_dpo/beta_margin_grad_std": 0.17076851427555084, "epsilon_dpo/beta_margin_mean": 0.4270060062408447, "epsilon_dpo/beta_margin_std": 0.7996051907539368, "epsilon_dpo/loss_margin_mean": 15.369938850402832, "grad_norm": 11.385334014892578, "kl/avg_steps": 0.375, "kl/beta": 0.028238195925951004, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.9492236680336483e-07, "logits/chosen": -0.28435638546943665, "logits/rejected": -0.6896387338638306, "logps/chosen": -140.2555389404297, "logps/ref_chosen": -113.93284606933594, "logps/ref_rejected": -133.01231384277344, "logps/rejected": -174.7049560546875, "loss": 1.1481, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7427991032600403, "rewards/margins": 0.4270060062408447, "rewards/rejected": -1.1698050498962402, "step": 407 }, { "epoch": 0.6167800453514739, "epsilon_dpo/beta": 0.028003625571727753, "epsilon_dpo/beta_margin_grad_mean": -0.36860692501068115, "epsilon_dpo/beta_margin_grad_std": 0.16879484057426453, "epsilon_dpo/beta_margin_mean": 0.6323613524436951, "epsilon_dpo/beta_margin_std": 0.8337081074714661, "epsilon_dpo/loss_margin_mean": 22.75626564025879, "grad_norm": 9.038987159729004, "kl/avg_steps": 0.46875, "kl/beta": 0.028132697567343712, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.9363341121154895e-07, "logits/chosen": -0.42244648933410645, "logits/rejected": -0.5274006128311157, "logps/chosen": -93.25194549560547, "logps/ref_chosen": -77.43348693847656, "logps/ref_rejected": -100.76187133789062, "logps/rejected": -139.33657836914062, "loss": 0.9995, "rewards/accuracies": 0.75, "rewards/chosen": -0.4446289837360382, "rewards/margins": 0.6323613524436951, "rewards/rejected": -1.0769903659820557, "step": 408 }, { "epoch": 0.618291761148904, "epsilon_dpo/beta": 0.027951732277870178, "epsilon_dpo/beta_margin_grad_mean": -0.4339336156845093, "epsilon_dpo/beta_margin_grad_std": 0.18402144312858582, "epsilon_dpo/beta_margin_mean": 0.3334841728210449, "epsilon_dpo/beta_margin_std": 0.8810906410217285, "epsilon_dpo/loss_margin_mean": 12.15471076965332, "grad_norm": 10.198472023010254, "kl/avg_steps": 0.1875, "kl/beta": 0.028001440688967705, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 1.9234603231438994e-07, "logits/chosen": -0.12168080359697342, "logits/rejected": -0.14667344093322754, "logps/chosen": -110.5352554321289, "logps/ref_chosen": -86.81268310546875, "logps/ref_rejected": -84.29527282714844, "logps/rejected": -120.17254638671875, "loss": 1.2535, "rewards/accuracies": 0.625, "rewards/chosen": -0.6660339832305908, "rewards/margins": 0.3334841728210449, "rewards/rejected": -0.9995181560516357, "step": 409 }, { "epoch": 0.6198034769463341, "epsilon_dpo/beta": 0.027829542756080627, "epsilon_dpo/beta_margin_grad_mean": -0.37669700384140015, "epsilon_dpo/beta_margin_grad_std": 0.16883623600006104, "epsilon_dpo/beta_margin_mean": 0.5865784883499146, "epsilon_dpo/beta_margin_std": 0.8136049509048462, "epsilon_dpo/loss_margin_mean": 21.269351959228516, "grad_norm": 10.556927680969238, "kl/avg_steps": 0.4375, "kl/beta": 0.02794903703033924, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -0.5301246643066406, "logits/rejected": -0.44228461384773254, "logps/chosen": -105.40447998046875, "logps/ref_chosen": -85.89543151855469, "logps/ref_rejected": -96.34490203857422, "logps/rejected": -137.123291015625, "loss": 1.0278, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5454895496368408, "rewards/margins": 0.5865784883499146, "rewards/rejected": -1.1320680379867554, "step": 410 }, { "epoch": 0.6213151927437641, "epsilon_dpo/beta": 0.027725713327527046, "epsilon_dpo/beta_margin_grad_mean": -0.3819405436515808, "epsilon_dpo/beta_margin_grad_std": 0.1781851202249527, "epsilon_dpo/beta_margin_mean": 0.5704288482666016, "epsilon_dpo/beta_margin_std": 0.8644275069236755, "epsilon_dpo/loss_margin_mean": 20.78996467590332, "grad_norm": 17.217391967773438, "kl/avg_steps": 0.375, "kl/beta": 0.027827292680740356, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.8977614860195296e-07, "logits/chosen": -0.2461598962545395, "logits/rejected": -0.2894115447998047, "logps/chosen": -96.726318359375, "logps/ref_chosen": -75.55298614501953, "logps/ref_rejected": -100.882568359375, "logps/rejected": -142.84585571289062, "loss": 1.0574, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5909495949745178, "rewards/margins": 0.5704289078712463, "rewards/rejected": -1.1613783836364746, "step": 411 }, { "epoch": 0.6228269085411943, "epsilon_dpo/beta": 0.027596134692430496, "epsilon_dpo/beta_margin_grad_mean": -0.3919138014316559, "epsilon_dpo/beta_margin_grad_std": 0.1753304898738861, "epsilon_dpo/beta_margin_mean": 0.5203702449798584, "epsilon_dpo/beta_margin_std": 0.8557584881782532, "epsilon_dpo/loss_margin_mean": 19.04148292541504, "grad_norm": 11.123086929321289, "kl/avg_steps": 0.46875, "kl/beta": 0.02772332914173603, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.8849371567184662e-07, "logits/chosen": -0.10348869115114212, "logits/rejected": -0.18310517072677612, "logps/chosen": -103.1690673828125, "logps/ref_chosen": -78.12739562988281, "logps/ref_rejected": -97.01484680175781, "logps/rejected": -141.09799194335938, "loss": 1.0914, "rewards/accuracies": 0.75, "rewards/chosen": -0.6940072774887085, "rewards/margins": 0.5203702449798584, "rewards/rejected": -1.214377522468567, "step": 412 }, { "epoch": 0.6243386243386243, "epsilon_dpo/beta": 0.027523396536707878, "epsilon_dpo/beta_margin_grad_mean": -0.4108419120311737, "epsilon_dpo/beta_margin_grad_std": 0.1892181932926178, "epsilon_dpo/beta_margin_mean": 0.42955899238586426, "epsilon_dpo/beta_margin_std": 0.9341761469841003, "epsilon_dpo/loss_margin_mean": 15.84644603729248, "grad_norm": 14.241096496582031, "kl/avg_steps": 0.265625, "kl/beta": 0.027593983337283134, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.872130032047302e-07, "logits/chosen": -0.3025868535041809, "logits/rejected": -0.5660539865493774, "logps/chosen": -118.65052795410156, "logps/ref_chosen": -91.40047454833984, "logps/ref_rejected": -99.7619857788086, "logps/rejected": -142.8584747314453, "loss": 1.1932, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7531765699386597, "rewards/margins": 0.42955899238586426, "rewards/rejected": -1.182735562324524, "step": 413 }, { "epoch": 0.6258503401360545, "epsilon_dpo/beta": 0.02738601341843605, "epsilon_dpo/beta_margin_grad_mean": -0.3774614930152893, "epsilon_dpo/beta_margin_grad_std": 0.1806628406047821, "epsilon_dpo/beta_margin_mean": 0.6020256876945496, "epsilon_dpo/beta_margin_std": 0.9087315201759338, "epsilon_dpo/loss_margin_mean": 22.174726486206055, "grad_norm": 10.976476669311523, "kl/avg_steps": 0.5, "kl/beta": 0.027520880103111267, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.8593404702488436e-07, "logits/chosen": -0.06367478519678116, "logits/rejected": -0.1593630015850067, "logps/chosen": -100.70257568359375, "logps/ref_chosen": -75.29246520996094, "logps/ref_rejected": -100.37710571289062, "logps/rejected": -147.96194458007812, "loss": 1.0476, "rewards/accuracies": 0.75, "rewards/chosen": -0.6971333622932434, "rewards/margins": 0.6020256280899048, "rewards/rejected": -1.299159049987793, "step": 414 }, { "epoch": 0.6273620559334845, "epsilon_dpo/beta": 0.02725832350552082, "epsilon_dpo/beta_margin_grad_mean": -0.39092764258384705, "epsilon_dpo/beta_margin_grad_std": 0.1978784054517746, "epsilon_dpo/beta_margin_mean": 0.5205000042915344, "epsilon_dpo/beta_margin_std": 0.9865575432777405, "epsilon_dpo/loss_margin_mean": 19.31855583190918, "grad_norm": 10.629053115844727, "kl/avg_steps": 0.46875, "kl/beta": 0.027383960783481598, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.846568829074628e-07, "logits/chosen": -0.2597837448120117, "logits/rejected": -0.2640593647956848, "logps/chosen": -102.90240478515625, "logps/ref_chosen": -77.00994873046875, "logps/ref_rejected": -80.96036529541016, "logps/rejected": -126.17137908935547, "loss": 1.1433, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7089916467666626, "rewards/margins": 0.5205000638961792, "rewards/rejected": -1.2294917106628418, "step": 415 }, { "epoch": 0.6288737717309146, "epsilon_dpo/beta": 0.02716521918773651, "epsilon_dpo/beta_margin_grad_mean": -0.4312796890735626, "epsilon_dpo/beta_margin_grad_std": 0.18393133580684662, "epsilon_dpo/beta_margin_mean": 0.33430424332618713, "epsilon_dpo/beta_margin_std": 0.8818397521972656, "epsilon_dpo/loss_margin_mean": 12.520179748535156, "grad_norm": 11.289984703063965, "kl/avg_steps": 0.34375, "kl/beta": 0.0272561963647604, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.8338154657749128e-07, "logits/chosen": -0.38363879919052124, "logits/rejected": -0.6178092956542969, "logps/chosen": -110.30681610107422, "logps/ref_chosen": -79.6038589477539, "logps/ref_rejected": -93.3397216796875, "logps/rejected": -136.5628662109375, "loss": 1.2532, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8370293378829956, "rewards/margins": 0.3343042731285095, "rewards/rejected": -1.1713335514068604, "step": 416 }, { "epoch": 0.6303854875283447, "epsilon_dpo/beta": 0.027021225541830063, "epsilon_dpo/beta_margin_grad_mean": -0.36599236726760864, "epsilon_dpo/beta_margin_grad_std": 0.18626999855041504, "epsilon_dpo/beta_margin_mean": 0.647750735282898, "epsilon_dpo/beta_margin_std": 0.9404109716415405, "epsilon_dpo/loss_margin_mean": 24.17540740966797, "grad_norm": 10.080581665039062, "kl/avg_steps": 0.53125, "kl/beta": 0.02716282568871975, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.8210807370886849e-07, "logits/chosen": -0.2293497771024704, "logits/rejected": -0.2854086458683014, "logps/chosen": -105.18743133544922, "logps/ref_chosen": -79.04769897460938, "logps/ref_rejected": -101.35102844238281, "logps/rejected": -151.66616821289062, "loss": 1.0277, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7088662385940552, "rewards/margins": 0.647750735282898, "rewards/rejected": -1.3566169738769531, "step": 417 }, { "epoch": 0.6318972033257747, "epsilon_dpo/beta": 0.02697976492345333, "epsilon_dpo/beta_margin_grad_mean": -0.44641682505607605, "epsilon_dpo/beta_margin_grad_std": 0.19468529522418976, "epsilon_dpo/beta_margin_mean": 0.26206421852111816, "epsilon_dpo/beta_margin_std": 0.9137290120124817, "epsilon_dpo/loss_margin_mean": 9.974386215209961, "grad_norm": 13.358625411987305, "kl/avg_steps": 0.15625, "kl/beta": 0.027019284665584564, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.8083649992336825e-07, "logits/chosen": -0.29838278889656067, "logits/rejected": -0.4096614718437195, "logps/chosen": -131.08416748046875, "logps/ref_chosen": -96.25605773925781, "logps/ref_rejected": -97.95796203613281, "logps/rejected": -142.76046752929688, "loss": 1.3312, "rewards/accuracies": 0.625, "rewards/chosen": -0.9437500834465027, "rewards/margins": 0.2620641887187958, "rewards/rejected": -1.205814242362976, "step": 418 }, { "epoch": 0.6334089191232048, "epsilon_dpo/beta": 0.026811206713318825, "epsilon_dpo/beta_margin_grad_mean": -0.3356010615825653, "epsilon_dpo/beta_margin_grad_std": 0.17441126704216003, "epsilon_dpo/beta_margin_mean": 0.7969227433204651, "epsilon_dpo/beta_margin_std": 0.8698088526725769, "epsilon_dpo/loss_margin_mean": 29.899383544921875, "grad_norm": 10.102495193481445, "kl/avg_steps": 0.625, "kl/beta": 0.026977133005857468, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7956686078964255e-07, "logits/chosen": -0.16823619604110718, "logits/rejected": -0.3001464605331421, "logps/chosen": -100.81460571289062, "logps/ref_chosen": -81.72348022460938, "logps/ref_rejected": -99.48829650878906, "logps/rejected": -148.4788055419922, "loss": 0.9006, "rewards/accuracies": 0.8125, "rewards/chosen": -0.51449054479599, "rewards/margins": 0.7969227433204651, "rewards/rejected": -1.311413288116455, "step": 419 }, { "epoch": 0.6349206349206349, "epsilon_dpo/beta": 0.026736844331026077, "epsilon_dpo/beta_margin_grad_mean": -0.41811904311180115, "epsilon_dpo/beta_margin_grad_std": 0.2055063396692276, "epsilon_dpo/beta_margin_mean": 0.407371461391449, "epsilon_dpo/beta_margin_std": 1.029210090637207, "epsilon_dpo/loss_margin_mean": 15.509881973266602, "grad_norm": 12.732593536376953, "kl/avg_steps": 0.28125, "kl/beta": 0.02680957317352295, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.782991918222275e-07, "logits/chosen": -0.293887197971344, "logits/rejected": -0.454693078994751, "logps/chosen": -109.46027374267578, "logps/ref_chosen": -78.56198120117188, "logps/ref_rejected": -85.0513916015625, "logps/rejected": -131.45956420898438, "loss": 1.25, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8302141427993774, "rewards/margins": 0.4073714315891266, "rewards/rejected": -1.2375855445861816, "step": 420 }, { "epoch": 0.636432350718065, "epsilon_dpo/beta": 0.026653503999114037, "epsilon_dpo/beta_margin_grad_mean": -0.4126037061214447, "epsilon_dpo/beta_margin_grad_std": 0.21195949614048004, "epsilon_dpo/beta_margin_mean": 0.41447150707244873, "epsilon_dpo/beta_margin_std": 1.0345741510391235, "epsilon_dpo/loss_margin_mean": 15.841025352478027, "grad_norm": 11.281766891479492, "kl/avg_steps": 0.3125, "kl/beta": 0.026734383776783943, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.7703352848054887e-07, "logits/chosen": -0.1268298327922821, "logits/rejected": -0.5857471823692322, "logps/chosen": -110.41204071044922, "logps/ref_chosen": -82.40538024902344, "logps/ref_rejected": -100.89399719238281, "logps/rejected": -144.74168395996094, "loss": 1.2508, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7509020566940308, "rewards/margins": 0.41447150707244873, "rewards/rejected": -1.1653735637664795, "step": 421 }, { "epoch": 0.6379440665154951, "epsilon_dpo/beta": 0.026520494371652603, "epsilon_dpo/beta_margin_grad_mean": -0.3952094614505768, "epsilon_dpo/beta_margin_grad_std": 0.1741863489151001, "epsilon_dpo/beta_margin_mean": 0.5068070888519287, "epsilon_dpo/beta_margin_std": 0.8488343954086304, "epsilon_dpo/loss_margin_mean": 19.29708480834961, "grad_norm": 11.663961410522461, "kl/avg_steps": 0.5, "kl/beta": 0.02665109746158123, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7576990616793137e-07, "logits/chosen": -0.09396468847990036, "logits/rejected": -0.29736435413360596, "logps/chosen": -119.38473510742188, "logps/ref_chosen": -93.38703918457031, "logps/ref_rejected": -103.51509094238281, "logps/rejected": -148.80987548828125, "loss": 1.0988, "rewards/accuracies": 0.75, "rewards/chosen": -0.6915041208267212, "rewards/margins": 0.5068070888519287, "rewards/rejected": -1.19831120967865, "step": 422 }, { "epoch": 0.6394557823129252, "epsilon_dpo/beta": 0.02638855203986168, "epsilon_dpo/beta_margin_grad_mean": -0.3787572979927063, "epsilon_dpo/beta_margin_grad_std": 0.169308602809906, "epsilon_dpo/beta_margin_mean": 0.5847669243812561, "epsilon_dpo/beta_margin_std": 0.862210750579834, "epsilon_dpo/loss_margin_mean": 22.34361457824707, "grad_norm": 9.964740753173828, "kl/avg_steps": 0.5, "kl/beta": 0.026518505066633224, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.745083602306071e-07, "logits/chosen": -0.1422388255596161, "logits/rejected": -0.5984101295471191, "logps/chosen": -104.98013305664062, "logps/ref_chosen": -78.10965728759766, "logps/ref_rejected": -99.3348388671875, "logps/rejected": -148.54891967773438, "loss": 1.0417, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7118363976478577, "rewards/margins": 0.5847669839859009, "rewards/rejected": -1.2966034412384033, "step": 423 }, { "epoch": 0.6409674981103552, "epsilon_dpo/beta": 0.0262325257062912, "epsilon_dpo/beta_margin_grad_mean": -0.3739382028579712, "epsilon_dpo/beta_margin_grad_std": 0.17924833297729492, "epsilon_dpo/beta_margin_mean": 0.5975351929664612, "epsilon_dpo/beta_margin_std": 0.8674590587615967, "epsilon_dpo/loss_margin_mean": 22.970008850097656, "grad_norm": 10.012207984924316, "kl/avg_steps": 0.59375, "kl/beta": 0.026386572048068047, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.7324892595672804e-07, "logits/chosen": -0.6137542724609375, "logits/rejected": -0.353971928358078, "logps/chosen": -101.5662841796875, "logps/ref_chosen": -73.11288452148438, "logps/ref_rejected": -98.70126342773438, "logps/rejected": -150.12466430664062, "loss": 1.0399, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7486594915390015, "rewards/margins": 0.5975351333618164, "rewards/rejected": -1.3461946249008179, "step": 424 }, { "epoch": 0.6424792139077853, "epsilon_dpo/beta": 0.026085887104272842, "epsilon_dpo/beta_margin_grad_mean": -0.3714931607246399, "epsilon_dpo/beta_margin_grad_std": 0.17195554077625275, "epsilon_dpo/beta_margin_mean": 0.6200801730155945, "epsilon_dpo/beta_margin_std": 0.8400947451591492, "epsilon_dpo/loss_margin_mean": 23.949697494506836, "grad_norm": 10.907929420471191, "kl/avg_steps": 0.5625, "kl/beta": 0.0262308269739151, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.7199163857537824e-07, "logits/chosen": -0.07301705330610275, "logits/rejected": -0.19498585164546967, "logps/chosen": -109.0750732421875, "logps/ref_chosen": -83.90113830566406, "logps/ref_rejected": -93.6903076171875, "logps/rejected": -142.81394958496094, "loss": 1.0115, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6586664915084839, "rewards/margins": 0.6200801730155945, "rewards/rejected": -1.2787466049194336, "step": 425 }, { "epoch": 0.6439909297052154, "epsilon_dpo/beta": 0.026054104790091515, "epsilon_dpo/beta_margin_grad_mean": -0.45818114280700684, "epsilon_dpo/beta_margin_grad_std": 0.2074817568063736, "epsilon_dpo/beta_margin_mean": 0.21456298232078552, "epsilon_dpo/beta_margin_std": 1.0599706172943115, "epsilon_dpo/loss_margin_mean": 8.526104927062988, "grad_norm": 16.841733932495117, "kl/avg_steps": 0.125, "kl/beta": 0.02608410455286503, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 1.7073653325558828e-07, "logits/chosen": -0.15785779058933258, "logits/rejected": -0.3029636740684509, "logps/chosen": -130.49227905273438, "logps/ref_chosen": -95.90599060058594, "logps/ref_rejected": -96.2318115234375, "logps/rejected": -139.34420776367188, "loss": 1.427, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9045907258987427, "rewards/margins": 0.2145630121231079, "rewards/rejected": -1.1191537380218506, "step": 426 }, { "epoch": 0.6455026455026455, "epsilon_dpo/beta": 0.02594830095767975, "epsilon_dpo/beta_margin_grad_mean": -0.39626121520996094, "epsilon_dpo/beta_margin_grad_std": 0.18655137717723846, "epsilon_dpo/beta_margin_mean": 0.5007147789001465, "epsilon_dpo/beta_margin_std": 0.9225812554359436, "epsilon_dpo/loss_margin_mean": 19.527267456054688, "grad_norm": 9.823708534240723, "kl/avg_steps": 0.40625, "kl/beta": 0.026051539927721024, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.6948364510535218e-07, "logits/chosen": -0.28479713201522827, "logits/rejected": -0.48131686449050903, "logps/chosen": -112.27393341064453, "logps/ref_chosen": -81.92965698242188, "logps/ref_rejected": -105.96514892578125, "logps/rejected": -155.83670043945312, "loss": 1.1312, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7907149791717529, "rewards/margins": 0.5007147789001465, "rewards/rejected": -1.2914297580718994, "step": 427 }, { "epoch": 0.6470143613000756, "epsilon_dpo/beta": 0.025835201144218445, "epsilon_dpo/beta_margin_grad_mean": -0.38718709349632263, "epsilon_dpo/beta_margin_grad_std": 0.1809690296649933, "epsilon_dpo/beta_margin_mean": 0.5327649116516113, "epsilon_dpo/beta_margin_std": 0.869602382183075, "epsilon_dpo/loss_margin_mean": 20.841089248657227, "grad_norm": 12.027029991149902, "kl/avg_steps": 0.4375, "kl/beta": 0.025946132838726044, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -0.12337511777877808, "logits/rejected": -0.30909281969070435, "logps/chosen": -126.36972045898438, "logps/ref_chosen": -95.26530456542969, "logps/ref_rejected": -106.78121948242188, "logps/rejected": -158.72671508789062, "loss": 1.0898, "rewards/accuracies": 0.75, "rewards/chosen": -0.8065395355224609, "rewards/margins": 0.5327649116516113, "rewards/rejected": -1.3393044471740723, "step": 428 }, { "epoch": 0.6485260770975056, "epsilon_dpo/beta": 0.025714591145515442, "epsilon_dpo/beta_margin_grad_mean": -0.38070225715637207, "epsilon_dpo/beta_margin_grad_std": 0.20825932919979095, "epsilon_dpo/beta_margin_mean": 0.5661614537239075, "epsilon_dpo/beta_margin_std": 1.029008150100708, "epsilon_dpo/loss_margin_mean": 22.288246154785156, "grad_norm": 10.108731269836426, "kl/avg_steps": 0.46875, "kl/beta": 0.025833113119006157, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.669846604344412e-07, "logits/chosen": -0.21388304233551025, "logits/rejected": -0.1304803490638733, "logps/chosen": -116.25877380371094, "logps/ref_chosen": -84.36827850341797, "logps/ref_rejected": -79.90157318115234, "logps/rejected": -134.080322265625, "loss": 1.1294, "rewards/accuracies": 0.75, "rewards/chosen": -0.8244999051094055, "rewards/margins": 0.5661614537239075, "rewards/rejected": -1.390661358833313, "step": 429 }, { "epoch": 0.6500377928949358, "epsilon_dpo/beta": 0.025586578994989395, "epsilon_dpo/beta_margin_grad_mean": -0.3593744933605194, "epsilon_dpo/beta_margin_grad_std": 0.17643196880817413, "epsilon_dpo/beta_margin_mean": 0.6664764285087585, "epsilon_dpo/beta_margin_std": 0.8701526522636414, "epsilon_dpo/loss_margin_mean": 26.25492286682129, "grad_norm": 9.700910568237305, "kl/avg_steps": 0.5, "kl/beta": 0.02571258507668972, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6573863381573954e-07, "logits/chosen": -0.302243709564209, "logits/rejected": -0.07065822929143906, "logps/chosen": -108.92573547363281, "logps/ref_chosen": -82.68994140625, "logps/ref_rejected": -89.63719177246094, "logps/rejected": -142.12791442871094, "loss": 0.9903, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6739843487739563, "rewards/margins": 0.6664764881134033, "rewards/rejected": -1.3404607772827148, "step": 430 }, { "epoch": 0.6515495086923658, "epsilon_dpo/beta": 0.02546728029847145, "epsilon_dpo/beta_margin_grad_mean": -0.38510796427726746, "epsilon_dpo/beta_margin_grad_std": 0.1891801953315735, "epsilon_dpo/beta_margin_mean": 0.5427370667457581, "epsilon_dpo/beta_margin_std": 0.9203450083732605, "epsilon_dpo/loss_margin_mean": 21.542346954345703, "grad_norm": 10.811869621276855, "kl/avg_steps": 0.46875, "kl/beta": 0.025584662333130836, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.36825886368751526, "logits/rejected": -0.3587014079093933, "logps/chosen": -94.90249633789062, "logps/ref_chosen": -71.41302490234375, "logps/ref_rejected": -94.84451293945312, "logps/rejected": -139.8763427734375, "loss": 1.1017, "rewards/accuracies": 0.75, "rewards/chosen": -0.6012818813323975, "rewards/margins": 0.5427370071411133, "rewards/rejected": -1.1440188884735107, "step": 431 }, { "epoch": 0.6530612244897959, "epsilon_dpo/beta": 0.02537233754992485, "epsilon_dpo/beta_margin_grad_mean": -0.39833539724349976, "epsilon_dpo/beta_margin_grad_std": 0.20775966346263885, "epsilon_dpo/beta_margin_mean": 0.5033900141716003, "epsilon_dpo/beta_margin_std": 1.0366266965866089, "epsilon_dpo/loss_margin_mean": 20.118471145629883, "grad_norm": 12.222615242004395, "kl/avg_steps": 0.375, "kl/beta": 0.02546529471874237, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.632536862810844e-07, "logits/chosen": -0.6639053821563721, "logits/rejected": -0.31408679485321045, "logps/chosen": -117.53076171875, "logps/ref_chosen": -87.24708557128906, "logps/ref_rejected": -109.61161804199219, "logps/rejected": -160.01376342773438, "loss": 1.1779, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7728521823883057, "rewards/margins": 0.5033900141716003, "rewards/rejected": -1.2762421369552612, "step": 432 }, { "epoch": 0.654572940287226, "epsilon_dpo/beta": 0.025245830416679382, "epsilon_dpo/beta_margin_grad_mean": -0.35882607102394104, "epsilon_dpo/beta_margin_grad_std": 0.21039634943008423, "epsilon_dpo/beta_margin_mean": 0.7088367342948914, "epsilon_dpo/beta_margin_std": 1.0585843324661255, "epsilon_dpo/loss_margin_mean": 28.348417282104492, "grad_norm": 11.30290699005127, "kl/avg_steps": 0.5, "kl/beta": 0.0253701563924551, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6201483487445515e-07, "logits/chosen": 0.08845943212509155, "logits/rejected": -0.13681207597255707, "logps/chosen": -109.51736450195312, "logps/ref_chosen": -84.12983703613281, "logps/ref_rejected": -88.07145690917969, "logps/rejected": -141.80740356445312, "loss": 1.0359, "rewards/accuracies": 0.75, "rewards/chosen": -0.6448712348937988, "rewards/margins": 0.7088366746902466, "rewards/rejected": -1.3537079095840454, "step": 433 }, { "epoch": 0.656084656084656, "epsilon_dpo/beta": 0.025136008858680725, "epsilon_dpo/beta_margin_grad_mean": -0.3597789704799652, "epsilon_dpo/beta_margin_grad_std": 0.1990552842617035, "epsilon_dpo/beta_margin_mean": 0.6943358182907104, "epsilon_dpo/beta_margin_std": 0.9835460782051086, "epsilon_dpo/loss_margin_mean": 27.884300231933594, "grad_norm": 9.337661743164062, "kl/avg_steps": 0.4375, "kl/beta": 0.025243936106562614, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.6077844460203204e-07, "logits/chosen": -0.3891775906085968, "logits/rejected": -0.4619581699371338, "logps/chosen": -87.47248077392578, "logps/ref_chosen": -67.95369720458984, "logps/ref_rejected": -95.22476196289062, "logps/rejected": -142.6278533935547, "loss": 1.0156, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49363699555397034, "rewards/margins": 0.6943358778953552, "rewards/rejected": -1.1879727840423584, "step": 434 }, { "epoch": 0.6575963718820862, "epsilon_dpo/beta": 0.025065790861845016, "epsilon_dpo/beta_margin_grad_mean": -0.3998577892780304, "epsilon_dpo/beta_margin_grad_std": 0.19081562757492065, "epsilon_dpo/beta_margin_mean": 0.48510000109672546, "epsilon_dpo/beta_margin_std": 0.910292387008667, "epsilon_dpo/loss_margin_mean": 19.630142211914062, "grad_norm": 12.830538749694824, "kl/avg_steps": 0.28125, "kl/beta": 0.025133974850177765, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.5954455004830878e-07, "logits/chosen": -0.03043718822300434, "logits/rejected": 0.05432761833071709, "logps/chosen": -109.69837951660156, "logps/ref_chosen": -79.04033660888672, "logps/ref_rejected": -95.648193359375, "logps/rejected": -145.93637084960938, "loss": 1.1425, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7722083926200867, "rewards/margins": 0.48510000109672546, "rewards/rejected": -1.2573083639144897, "step": 435 }, { "epoch": 0.6591080876795162, "epsilon_dpo/beta": 0.02495632693171501, "epsilon_dpo/beta_margin_grad_mean": -0.39338260889053345, "epsilon_dpo/beta_margin_grad_std": 0.19476363062858582, "epsilon_dpo/beta_margin_mean": 0.512516975402832, "epsilon_dpo/beta_margin_std": 0.9612401127815247, "epsilon_dpo/loss_margin_mean": 20.7864990234375, "grad_norm": 11.167028427124023, "kl/avg_steps": 0.4375, "kl/beta": 0.02506348490715027, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.5831318572796847e-07, "logits/chosen": -0.384287029504776, "logits/rejected": -0.22708836197853088, "logps/chosen": -105.12644958496094, "logps/ref_chosen": -79.86819458007812, "logps/ref_rejected": -94.28229522705078, "logps/rejected": -140.32705688476562, "loss": 1.1392, "rewards/accuracies": 0.734375, "rewards/chosen": -0.632888674736023, "rewards/margins": 0.512516975402832, "rewards/rejected": -1.145405650138855, "step": 436 }, { "epoch": 0.6606198034769464, "epsilon_dpo/beta": 0.024886613711714745, "epsilon_dpo/beta_margin_grad_mean": -0.40573927760124207, "epsilon_dpo/beta_margin_grad_std": 0.202871635556221, "epsilon_dpo/beta_margin_mean": 0.4717595875263214, "epsilon_dpo/beta_margin_std": 1.0110931396484375, "epsilon_dpo/loss_margin_mean": 19.250438690185547, "grad_norm": 17.93876075744629, "kl/avg_steps": 0.28125, "kl/beta": 0.024954309687018394, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.5708438608491815e-07, "logits/chosen": -0.05259464681148529, "logits/rejected": -0.2141057848930359, "logps/chosen": -112.15205383300781, "logps/ref_chosen": -78.60440826416016, "logps/ref_rejected": -120.1206283569336, "logps/rejected": -172.91871643066406, "loss": 1.1904, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8363392949104309, "rewards/margins": 0.471759557723999, "rewards/rejected": -1.3080989122390747, "step": 437 }, { "epoch": 0.6621315192743764, "epsilon_dpo/beta": 0.024754598736763, "epsilon_dpo/beta_margin_grad_mean": -0.3753092885017395, "epsilon_dpo/beta_margin_grad_std": 0.18493610620498657, "epsilon_dpo/beta_margin_mean": 0.6025840640068054, "epsilon_dpo/beta_margin_std": 0.9036944508552551, "epsilon_dpo/loss_margin_mean": 24.554054260253906, "grad_norm": 9.134082794189453, "kl/avg_steps": 0.53125, "kl/beta": 0.02488432265818119, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.558581854913253e-07, "logits/chosen": -0.3997645974159241, "logits/rejected": -0.19586080312728882, "logps/chosen": -85.06373596191406, "logps/ref_chosen": -61.14134216308594, "logps/ref_rejected": -89.01496887207031, "logps/rejected": -137.49142456054688, "loss": 1.0492, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5958795547485352, "rewards/margins": 0.6025840640068054, "rewards/rejected": -1.1984636783599854, "step": 438 }, { "epoch": 0.6636432350718064, "epsilon_dpo/beta": 0.02466246485710144, "epsilon_dpo/beta_margin_grad_mean": -0.38297438621520996, "epsilon_dpo/beta_margin_grad_std": 0.1793513000011444, "epsilon_dpo/beta_margin_mean": 0.5795184373855591, "epsilon_dpo/beta_margin_std": 0.8874919414520264, "epsilon_dpo/loss_margin_mean": 23.736835479736328, "grad_norm": 9.677699089050293, "kl/avg_steps": 0.375, "kl/beta": 0.024752821773290634, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5463461824665658e-07, "logits/chosen": -0.28539150953292847, "logits/rejected": -0.40400251746177673, "logps/chosen": -133.99826049804688, "logps/ref_chosen": -105.3409652709961, "logps/ref_rejected": -118.54618072509766, "logps/rejected": -170.9403076171875, "loss": 1.0571, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7099634408950806, "rewards/margins": 0.5795184373855591, "rewards/rejected": -1.2894818782806396, "step": 439 }, { "epoch": 0.6651549508692366, "epsilon_dpo/beta": 0.02450096420943737, "epsilon_dpo/beta_margin_grad_mean": -0.3457765281200409, "epsilon_dpo/beta_margin_grad_std": 0.17008066177368164, "epsilon_dpo/beta_margin_mean": 0.7509934902191162, "epsilon_dpo/beta_margin_std": 0.8576542139053345, "epsilon_dpo/loss_margin_mean": 30.81147003173828, "grad_norm": 11.456929206848145, "kl/avg_steps": 0.65625, "kl/beta": 0.024660347029566765, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.534137185767178e-07, "logits/chosen": -0.3186664581298828, "logits/rejected": -0.3347044289112091, "logps/chosen": -84.53070068359375, "logps/ref_chosen": -66.07489776611328, "logps/ref_rejected": -93.529296875, "logps/rejected": -142.79656982421875, "loss": 0.9247, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4537859559059143, "rewards/margins": 0.7509934902191162, "rewards/rejected": -1.2047793865203857, "step": 440 }, { "epoch": 0.6666666666666666, "epsilon_dpo/beta": 0.02435653656721115, "epsilon_dpo/beta_margin_grad_mean": -0.36488860845565796, "epsilon_dpo/beta_margin_grad_std": 0.17126530408859253, "epsilon_dpo/beta_margin_mean": 0.6469001173973083, "epsilon_dpo/beta_margin_std": 0.8343473076820374, "epsilon_dpo/loss_margin_mean": 26.742786407470703, "grad_norm": 10.66884994506836, "kl/avg_steps": 0.59375, "kl/beta": 0.024499567225575447, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.521955206326976e-07, "logits/chosen": -0.12676770985126495, "logits/rejected": -0.3456115126609802, "logps/chosen": -100.19915771484375, "logps/ref_chosen": -80.23723602294922, "logps/ref_rejected": -110.27183532714844, "logps/rejected": -156.97653198242188, "loss": 0.9907, "rewards/accuracies": 0.796875, "rewards/chosen": -0.48816603422164917, "rewards/margins": 0.6469000577926636, "rewards/rejected": -1.1350661516189575, "step": 441 }, { "epoch": 0.6681783824640968, "epsilon_dpo/beta": 0.024269822984933853, "epsilon_dpo/beta_margin_grad_mean": -0.3921697437763214, "epsilon_dpo/beta_margin_grad_std": 0.18120039999485016, "epsilon_dpo/beta_margin_mean": 0.5139843821525574, "epsilon_dpo/beta_margin_std": 0.8636588454246521, "epsilon_dpo/loss_margin_mean": 21.431564331054688, "grad_norm": 9.974164962768555, "kl/avg_steps": 0.359375, "kl/beta": 0.0243549607694149, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -0.4980059862136841, "logits/rejected": -0.2725337743759155, "logps/chosen": -117.87562561035156, "logps/ref_chosen": -88.29069519042969, "logps/ref_rejected": -115.02373504638672, "logps/rejected": -166.04022216796875, "loss": 1.1021, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7207363247871399, "rewards/margins": 0.5139843821525574, "rewards/rejected": -1.2347207069396973, "step": 442 }, { "epoch": 0.6696900982615268, "epsilon_dpo/beta": 0.024133656173944473, "epsilon_dpo/beta_margin_grad_mean": -0.36061957478523254, "epsilon_dpo/beta_margin_grad_std": 0.19436120986938477, "epsilon_dpo/beta_margin_mean": 0.6958425045013428, "epsilon_dpo/beta_margin_std": 1.0084832906723022, "epsilon_dpo/loss_margin_mean": 29.060047149658203, "grad_norm": 9.462386131286621, "kl/avg_steps": 0.5625, "kl/beta": 0.02426774799823761, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.4976736614834662e-07, "logits/chosen": -0.018897585570812225, "logits/rejected": -0.1660281866788864, "logps/chosen": -105.89450073242188, "logps/ref_chosen": -79.12504577636719, "logps/ref_rejected": -106.16142272949219, "logps/rejected": -161.99093627929688, "loss": 1.0179, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6496339440345764, "rewards/margins": 0.6958425045013428, "rewards/rejected": -1.3454763889312744, "step": 443 }, { "epoch": 0.671201814058957, "epsilon_dpo/beta": 0.02405519038438797, "epsilon_dpo/beta_margin_grad_mean": -0.4312235116958618, "epsilon_dpo/beta_margin_grad_std": 0.1837351769208908, "epsilon_dpo/beta_margin_mean": 0.33834654092788696, "epsilon_dpo/beta_margin_std": 0.907697856426239, "epsilon_dpo/loss_margin_mean": 14.312912940979004, "grad_norm": 14.25627326965332, "kl/avg_steps": 0.328125, "kl/beta": 0.024132005870342255, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.4855747752871654e-07, "logits/chosen": -0.4729708135128021, "logits/rejected": -0.5134774446487427, "logps/chosen": -112.18655395507812, "logps/ref_chosen": -80.70797729492188, "logps/ref_rejected": -115.05545806884766, "logps/rejected": -160.84695434570312, "loss": 1.2564, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7601273059844971, "rewards/margins": 0.33834654092788696, "rewards/rejected": -1.0984739065170288, "step": 444 }, { "epoch": 0.672713529856387, "epsilon_dpo/beta": 0.023935209959745407, "epsilon_dpo/beta_margin_grad_mean": -0.3562336564064026, "epsilon_dpo/beta_margin_grad_std": 0.17927402257919312, "epsilon_dpo/beta_margin_mean": 0.6905251741409302, "epsilon_dpo/beta_margin_std": 0.8721351027488708, "epsilon_dpo/loss_margin_mean": 29.084095001220703, "grad_norm": 13.974209785461426, "kl/avg_steps": 0.5, "kl/beta": 0.02405308187007904, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.473504264745062e-07, "logits/chosen": -0.3294023871421814, "logits/rejected": -0.16536982357501984, "logps/chosen": -113.95536041259766, "logps/ref_chosen": -84.2096939086914, "logps/ref_rejected": -96.03083038330078, "logps/rejected": -154.860595703125, "loss": 0.9751, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7159168720245361, "rewards/margins": 0.6905251741409302, "rewards/rejected": -1.4064419269561768, "step": 445 }, { "epoch": 0.674225245653817, "epsilon_dpo/beta": 0.02380117028951645, "epsilon_dpo/beta_margin_grad_mean": -0.3451477587223053, "epsilon_dpo/beta_margin_grad_std": 0.15824930369853973, "epsilon_dpo/beta_margin_mean": 0.7407541275024414, "epsilon_dpo/beta_margin_std": 0.7942438721656799, "epsilon_dpo/loss_margin_mean": 31.29979133605957, "grad_norm": 9.890649795532227, "kl/avg_steps": 0.5625, "kl/beta": 0.02393341436982155, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.461462467495284e-07, "logits/chosen": -0.07980051636695862, "logits/rejected": -0.21256430447101593, "logps/chosen": -90.020263671875, "logps/ref_chosen": -67.17405700683594, "logps/ref_rejected": -93.05891418457031, "logps/rejected": -147.2049102783203, "loss": 0.9097, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5450651049613953, "rewards/margins": 0.7407541871070862, "rewards/rejected": -1.2858192920684814, "step": 446 }, { "epoch": 0.6757369614512472, "epsilon_dpo/beta": 0.023675473406910896, "epsilon_dpo/beta_margin_grad_mean": -0.37413400411605835, "epsilon_dpo/beta_margin_grad_std": 0.1963834911584854, "epsilon_dpo/beta_margin_mean": 0.6092168688774109, "epsilon_dpo/beta_margin_std": 0.9781890511512756, "epsilon_dpo/loss_margin_mean": 25.984163284301758, "grad_norm": 11.772629737854004, "kl/avg_steps": 0.53125, "kl/beta": 0.02379954233765602, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.4494497203727843e-07, "logits/chosen": -0.23869886994361877, "logits/rejected": -0.692590057849884, "logps/chosen": -97.84403991699219, "logps/ref_chosen": -76.00971984863281, "logps/ref_rejected": -110.74910736083984, "logps/rejected": -158.56759643554688, "loss": 1.0732, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5195660591125488, "rewards/margins": 0.6092169284820557, "rewards/rejected": -1.1287829875946045, "step": 447 }, { "epoch": 0.6772486772486772, "epsilon_dpo/beta": 0.023565160110592842, "epsilon_dpo/beta_margin_grad_mean": -0.3909008204936981, "epsilon_dpo/beta_margin_grad_std": 0.18127767741680145, "epsilon_dpo/beta_margin_mean": 0.5344966650009155, "epsilon_dpo/beta_margin_std": 0.8911596536636353, "epsilon_dpo/loss_margin_mean": 22.905698776245117, "grad_norm": 10.394296646118164, "kl/avg_steps": 0.46875, "kl/beta": 0.023673774674534798, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.4374663593999256e-07, "logits/chosen": -0.38351887464523315, "logits/rejected": -0.38271769881248474, "logps/chosen": -107.19539642333984, "logps/ref_chosen": -78.36839294433594, "logps/ref_rejected": -104.13056945800781, "logps/rejected": -155.86328125, "loss": 1.0935, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6800907850265503, "rewards/margins": 0.5344966650009155, "rewards/rejected": -1.2145874500274658, "step": 448 }, { "epoch": 0.6787603930461074, "epsilon_dpo/beta": 0.023492034524679184, "epsilon_dpo/beta_margin_grad_mean": -0.4497864544391632, "epsilon_dpo/beta_margin_grad_std": 0.19710615277290344, "epsilon_dpo/beta_margin_mean": 0.2375793606042862, "epsilon_dpo/beta_margin_std": 0.9235098958015442, "epsilon_dpo/loss_margin_mean": 10.408994674682617, "grad_norm": 24.015352249145508, "kl/avg_steps": 0.3125, "kl/beta": 0.02356332167983055, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.4255127197770707e-07, "logits/chosen": -0.5230458974838257, "logits/rejected": -0.2880995273590088, "logps/chosen": -136.14390563964844, "logps/ref_chosen": -101.54728698730469, "logps/ref_rejected": -109.60847473144531, "logps/rejected": -154.6140899658203, "loss": 1.3573, "rewards/accuracies": 0.640625, "rewards/chosen": -0.814102053642273, "rewards/margins": 0.2375793755054474, "rewards/rejected": -1.051681399345398, "step": 449 }, { "epoch": 0.6802721088435374, "epsilon_dpo/beta": 0.023389486595988274, "epsilon_dpo/beta_margin_grad_mean": -0.40028244256973267, "epsilon_dpo/beta_margin_grad_std": 0.19258946180343628, "epsilon_dpo/beta_margin_mean": 0.4728687107563019, "epsilon_dpo/beta_margin_std": 0.9459613561630249, "epsilon_dpo/loss_margin_mean": 20.483104705810547, "grad_norm": 12.487074851989746, "kl/avg_steps": 0.4375, "kl/beta": 0.02348991669714451, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.4135891358732205e-07, "logits/chosen": -0.49118733406066895, "logits/rejected": -0.6007115840911865, "logps/chosen": -92.49281311035156, "logps/ref_chosen": -66.52513122558594, "logps/ref_rejected": -100.74227905273438, "logps/rejected": -147.19308471679688, "loss": 1.1646, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6101650595664978, "rewards/margins": 0.4728686809539795, "rewards/rejected": -1.083033800125122, "step": 450 }, { "epoch": 0.6817838246409675, "epsilon_dpo/beta": 0.023302221670746803, "epsilon_dpo/beta_margin_grad_mean": -0.40755635499954224, "epsilon_dpo/beta_margin_grad_std": 0.18733078241348267, "epsilon_dpo/beta_margin_mean": 0.44163015484809875, "epsilon_dpo/beta_margin_std": 0.8944984674453735, "epsilon_dpo/loss_margin_mean": 19.218761444091797, "grad_norm": 9.85672378540039, "kl/avg_steps": 0.375, "kl/beta": 0.023387596011161804, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.4016959412166437e-07, "logits/chosen": -0.24648326635360718, "logits/rejected": -0.27126193046569824, "logps/chosen": -112.3831787109375, "logps/ref_chosen": -86.15756225585938, "logps/ref_rejected": -100.31735229492188, "logps/rejected": -145.76173400878906, "loss": 1.1708, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6132830381393433, "rewards/margins": 0.44163012504577637, "rewards/rejected": -1.0549132823944092, "step": 451 }, { "epoch": 0.6832955404383976, "epsilon_dpo/beta": 0.023193318396806717, "epsilon_dpo/beta_margin_grad_mean": -0.3813394606113434, "epsilon_dpo/beta_margin_grad_std": 0.19726483523845673, "epsilon_dpo/beta_margin_mean": 0.5745446085929871, "epsilon_dpo/beta_margin_std": 0.9635257720947266, "epsilon_dpo/loss_margin_mean": 25.042280197143555, "grad_norm": 10.472002029418945, "kl/avg_steps": 0.46875, "kl/beta": 0.02330021932721138, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.3898334684855645e-07, "logits/chosen": -0.33625802397727966, "logits/rejected": -0.4327930212020874, "logps/chosen": -102.27005004882812, "logps/ref_chosen": -75.09101867675781, "logps/ref_rejected": -104.30365753173828, "logps/rejected": -156.52496337890625, "loss": 1.0946, "rewards/accuracies": 0.6875, "rewards/chosen": -0.634591281414032, "rewards/margins": 0.5745445489883423, "rewards/rejected": -1.2091357707977295, "step": 452 }, { "epoch": 0.6848072562358276, "epsilon_dpo/beta": 0.02309594303369522, "epsilon_dpo/beta_margin_grad_mean": -0.40049630403518677, "epsilon_dpo/beta_margin_grad_std": 0.2005462944507599, "epsilon_dpo/beta_margin_mean": 0.48215773701667786, "epsilon_dpo/beta_margin_std": 0.9639610052108765, "epsilon_dpo/loss_margin_mean": 21.156658172607422, "grad_norm": 13.386679649353027, "kl/avg_steps": 0.421875, "kl/beta": 0.023191509768366814, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.3780020494988445e-07, "logits/chosen": -0.18490701913833618, "logits/rejected": -0.4014432430267334, "logps/chosen": -113.76175689697266, "logps/ref_chosen": -85.76741027832031, "logps/ref_rejected": -97.41229248046875, "logps/rejected": -146.56329345703125, "loss": 1.1674, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6490232348442078, "rewards/margins": 0.48215776681900024, "rewards/rejected": -1.131181001663208, "step": 453 }, { "epoch": 0.6863189720332578, "epsilon_dpo/beta": 0.022973690181970596, "epsilon_dpo/beta_margin_grad_mean": -0.36466944217681885, "epsilon_dpo/beta_margin_grad_std": 0.17821235954761505, "epsilon_dpo/beta_margin_mean": 0.6326720714569092, "epsilon_dpo/beta_margin_std": 0.8994981050491333, "epsilon_dpo/loss_margin_mean": 27.766921997070312, "grad_norm": 10.431668281555176, "kl/avg_steps": 0.53125, "kl/beta": 0.02309408038854599, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.366202015206706e-07, "logits/chosen": -0.28234240412712097, "logits/rejected": -0.38511741161346436, "logps/chosen": -92.9898452758789, "logps/ref_chosen": -73.73841857910156, "logps/ref_rejected": -89.2447280883789, "logps/rejected": -136.26307678222656, "loss": 1.0238, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44317740201950073, "rewards/margins": 0.6326720714569092, "rewards/rejected": -1.0758495330810547, "step": 454 }, { "epoch": 0.6878306878306878, "epsilon_dpo/beta": 0.02285228855907917, "epsilon_dpo/beta_margin_grad_mean": -0.36160367727279663, "epsilon_dpo/beta_margin_grad_std": 0.1697792261838913, "epsilon_dpo/beta_margin_mean": 0.6611186861991882, "epsilon_dpo/beta_margin_std": 0.8520635962486267, "epsilon_dpo/loss_margin_mean": 29.138427734375, "grad_norm": 10.137971878051758, "kl/avg_steps": 0.53125, "kl/beta": 0.022972041741013527, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.354433695681474e-07, "logits/chosen": -0.44800513982772827, "logits/rejected": -0.49094438552856445, "logps/chosen": -125.23015594482422, "logps/ref_chosen": -97.0819320678711, "logps/ref_rejected": -106.07518768310547, "logps/rejected": -163.36184692382812, "loss": 0.9844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6450259685516357, "rewards/margins": 0.661118745803833, "rewards/rejected": -1.3061447143554688, "step": 455 }, { "epoch": 0.6893424036281179, "epsilon_dpo/beta": 0.022717243060469627, "epsilon_dpo/beta_margin_grad_mean": -0.38223299384117126, "epsilon_dpo/beta_margin_grad_std": 0.17249758541584015, "epsilon_dpo/beta_margin_mean": 0.5534289479255676, "epsilon_dpo/beta_margin_std": 0.8530722856521606, "epsilon_dpo/loss_margin_mean": 24.55746841430664, "grad_norm": 10.840579986572266, "kl/avg_steps": 0.59375, "kl/beta": 0.022850647568702698, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3426974201083439e-07, "logits/chosen": -0.3131183385848999, "logits/rejected": -0.05182047188282013, "logps/chosen": -105.27415466308594, "logps/ref_chosen": -78.9945297241211, "logps/ref_rejected": -106.11909484863281, "logps/rejected": -156.9561767578125, "loss": 1.0652, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5996953248977661, "rewards/margins": 0.5534289479255676, "rewards/rejected": -1.1531243324279785, "step": 456 }, { "epoch": 0.690854119425548, "epsilon_dpo/beta": 0.022618653252720833, "epsilon_dpo/beta_margin_grad_mean": -0.4127366840839386, "epsilon_dpo/beta_margin_grad_std": 0.159461110830307, "epsilon_dpo/beta_margin_mean": 0.40310725569725037, "epsilon_dpo/beta_margin_std": 0.7608078718185425, "epsilon_dpo/loss_margin_mean": 18.023801803588867, "grad_norm": 12.661500930786133, "kl/avg_steps": 0.4375, "kl/beta": 0.022715773433446884, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.3309935167761717e-07, "logits/chosen": -0.10714979469776154, "logits/rejected": -0.5415033102035522, "logps/chosen": -98.71037292480469, "logps/ref_chosen": -66.06265258789062, "logps/ref_rejected": -87.40638732910156, "logps/rejected": -138.07791137695312, "loss": 1.1523, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7403136491775513, "rewards/margins": 0.40310725569725037, "rewards/rejected": -1.143420934677124, "step": 457 }, { "epoch": 0.6923658352229781, "epsilon_dpo/beta": 0.022498922422528267, "epsilon_dpo/beta_margin_grad_mean": -0.36385974287986755, "epsilon_dpo/beta_margin_grad_std": 0.16366532444953918, "epsilon_dpo/beta_margin_mean": 0.6369403004646301, "epsilon_dpo/beta_margin_std": 0.7969620823860168, "epsilon_dpo/loss_margin_mean": 28.51059341430664, "grad_norm": 11.44316291809082, "kl/avg_steps": 0.53125, "kl/beta": 0.022616824135184288, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3193223130682936e-07, "logits/chosen": -0.06593091785907745, "logits/rejected": -0.5136197805404663, "logps/chosen": -93.28556823730469, "logps/ref_chosen": -72.29696655273438, "logps/ref_rejected": -112.85076904296875, "logps/rejected": -162.34996032714844, "loss": 0.9855, "rewards/accuracies": 0.78125, "rewards/chosen": -0.47399500012397766, "rewards/margins": 0.6369403004646301, "rewards/rejected": -1.1109352111816406, "step": 458 }, { "epoch": 0.6938775510204082, "epsilon_dpo/beta": 0.022362414747476578, "epsilon_dpo/beta_margin_grad_mean": -0.3720557391643524, "epsilon_dpo/beta_margin_grad_std": 0.15929129719734192, "epsilon_dpo/beta_margin_mean": 0.6010655164718628, "epsilon_dpo/beta_margin_std": 0.7777773141860962, "epsilon_dpo/loss_margin_mean": 27.049121856689453, "grad_norm": 11.10675048828125, "kl/avg_steps": 0.609375, "kl/beta": 0.022497307509183884, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3076841354533658e-07, "logits/chosen": -0.25048884749412537, "logits/rejected": -0.319161981344223, "logps/chosen": -113.13943481445312, "logps/ref_chosen": -90.7015609741211, "logps/ref_rejected": -112.55254364013672, "logps/rejected": -162.03955078125, "loss": 1.0037, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5026448965072632, "rewards/margins": 0.6010655164718628, "rewards/rejected": -1.103710412979126, "step": 459 }, { "epoch": 0.6953892668178382, "epsilon_dpo/beta": 0.022265443578362465, "epsilon_dpo/beta_margin_grad_mean": -0.3674984574317932, "epsilon_dpo/beta_margin_grad_std": 0.17106232047080994, "epsilon_dpo/beta_margin_mean": 0.6420856714248657, "epsilon_dpo/beta_margin_std": 0.8475755453109741, "epsilon_dpo/loss_margin_mean": 29.068513870239258, "grad_norm": 9.10698413848877, "kl/avg_steps": 0.4375, "kl/beta": 0.022361045703291893, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.2960793094762345e-07, "logits/chosen": -0.18218980729579926, "logits/rejected": -0.37891292572021484, "logps/chosen": -115.58694458007812, "logps/ref_chosen": -85.61819458007812, "logps/ref_rejected": -122.83721923828125, "logps/rejected": -181.87448120117188, "loss": 0.9972, "rewards/accuracies": 0.75, "rewards/chosen": -0.6702232360839844, "rewards/margins": 0.6420857310295105, "rewards/rejected": -1.3123090267181396, "step": 460 }, { "epoch": 0.6969009826152683, "epsilon_dpo/beta": 0.022154536098241806, "epsilon_dpo/beta_margin_grad_mean": -0.3609759509563446, "epsilon_dpo/beta_margin_grad_std": 0.16306711733341217, "epsilon_dpo/beta_margin_mean": 0.6506234407424927, "epsilon_dpo/beta_margin_std": 0.7753090858459473, "epsilon_dpo/loss_margin_mean": 29.590946197509766, "grad_norm": 10.064002990722656, "kl/avg_steps": 0.5, "kl/beta": 0.02226364053785801, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2845081597488286e-07, "logits/chosen": -0.2814715504646301, "logits/rejected": -0.2578463554382324, "logps/chosen": -92.06724548339844, "logps/ref_chosen": -68.47746276855469, "logps/ref_rejected": -101.3572998046875, "logps/rejected": -154.5380401611328, "loss": 0.97, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5240465402603149, "rewards/margins": 0.6506234407424927, "rewards/rejected": -1.1746699810028076, "step": 461 }, { "epoch": 0.6984126984126984, "epsilon_dpo/beta": 0.022016622126102448, "epsilon_dpo/beta_margin_grad_mean": -0.364952951669693, "epsilon_dpo/beta_margin_grad_std": 0.17581763863563538, "epsilon_dpo/beta_margin_mean": 0.6401377320289612, "epsilon_dpo/beta_margin_std": 0.8566480875015259, "epsilon_dpo/loss_margin_mean": 29.284711837768555, "grad_norm": 11.0882568359375, "kl/avg_steps": 0.625, "kl/beta": 0.02215287648141384, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.27297100994108e-07, "logits/chosen": 0.07481776177883148, "logits/rejected": -0.13942140340805054, "logps/chosen": -109.2104263305664, "logps/ref_chosen": -82.09750366210938, "logps/ref_rejected": -96.87604522705078, "logps/rejected": -153.273681640625, "loss": 1.0044, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5979229807853699, "rewards/margins": 0.6401377320289612, "rewards/rejected": -1.238060712814331, "step": 462 }, { "epoch": 0.6999244142101285, "epsilon_dpo/beta": 0.02194179780781269, "epsilon_dpo/beta_margin_grad_mean": -0.40211692452430725, "epsilon_dpo/beta_margin_grad_std": 0.17251704633235931, "epsilon_dpo/beta_margin_mean": 0.47483089566230774, "epsilon_dpo/beta_margin_std": 0.8205597400665283, "epsilon_dpo/loss_margin_mean": 21.893447875976562, "grad_norm": 10.746212005615234, "kl/avg_steps": 0.34375, "kl/beta": 0.022015281021595, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.2614681827718695e-07, "logits/chosen": -0.27809298038482666, "logits/rejected": -0.3278348445892334, "logps/chosen": -124.42726135253906, "logps/ref_chosen": -94.08966064453125, "logps/ref_rejected": -94.4760971069336, "logps/rejected": -146.7071533203125, "loss": 1.1157, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6673296689987183, "rewards/margins": 0.4748309552669525, "rewards/rejected": -1.1421605348587036, "step": 463 }, { "epoch": 0.7014361300075586, "epsilon_dpo/beta": 0.021859774366021156, "epsilon_dpo/beta_margin_grad_mean": -0.38281190395355225, "epsilon_dpo/beta_margin_grad_std": 0.2125449925661087, "epsilon_dpo/beta_margin_mean": 0.5871289372444153, "epsilon_dpo/beta_margin_std": 1.0753085613250732, "epsilon_dpo/loss_margin_mean": 27.19884490966797, "grad_norm": 10.855381965637207, "kl/avg_steps": 0.375, "kl/beta": 0.02193986251950264, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.21308638155460358, "logits/rejected": -0.13446223735809326, "logps/chosen": -114.7242202758789, "logps/ref_chosen": -81.69437408447266, "logps/ref_rejected": -90.02086639404297, "logps/rejected": -150.2495574951172, "loss": 1.1303, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7249494791030884, "rewards/margins": 0.5871289372444153, "rewards/rejected": -1.3120784759521484, "step": 464 }, { "epoch": 0.7029478458049887, "epsilon_dpo/beta": 0.021750781685113907, "epsilon_dpo/beta_margin_grad_mean": -0.3837484121322632, "epsilon_dpo/beta_margin_grad_std": 0.19969280064105988, "epsilon_dpo/beta_margin_mean": 0.587910532951355, "epsilon_dpo/beta_margin_std": 0.9850619435310364, "epsilon_dpo/loss_margin_mean": 27.304344177246094, "grad_norm": 10.320209503173828, "kl/avg_steps": 0.5, "kl/beta": 0.02185789681971073, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.238566782415197e-07, "logits/chosen": -0.0753406435251236, "logits/rejected": -0.21963469684123993, "logps/chosen": -116.1302719116211, "logps/ref_chosen": -81.77964782714844, "logps/ref_rejected": -109.90087890625, "logps/rejected": -171.55584716796875, "loss": 1.0913, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7500728368759155, "rewards/margins": 0.587910532951355, "rewards/rejected": -1.3379833698272705, "step": 465 }, { "epoch": 0.7044595616024187, "epsilon_dpo/beta": 0.021703744307160378, "epsilon_dpo/beta_margin_grad_mean": -0.4434705674648285, "epsilon_dpo/beta_margin_grad_std": 0.17293018102645874, "epsilon_dpo/beta_margin_mean": 0.2722676396369934, "epsilon_dpo/beta_margin_std": 0.8159366250038147, "epsilon_dpo/loss_margin_mean": 12.814265251159668, "grad_norm": 19.282155990600586, "kl/avg_steps": 0.21875, "kl/beta": 0.02174915000796318, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 1.2271688498291334e-07, "logits/chosen": -0.382304847240448, "logits/rejected": -0.16424314677715302, "logps/chosen": -136.80435180664062, "logps/ref_chosen": -99.00543212890625, "logps/ref_rejected": -97.36400604248047, "logps/rejected": -147.97720336914062, "loss": 1.283, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8247905373573303, "rewards/margins": 0.272267609834671, "rewards/rejected": -1.0970580577850342, "step": 466 }, { "epoch": 0.7059712773998488, "epsilon_dpo/beta": 0.02159532904624939, "epsilon_dpo/beta_margin_grad_mean": -0.3907034397125244, "epsilon_dpo/beta_margin_grad_std": 0.1725900024175644, "epsilon_dpo/beta_margin_mean": 0.5272072553634644, "epsilon_dpo/beta_margin_std": 0.8612512350082397, "epsilon_dpo/loss_margin_mean": 24.633930206298828, "grad_norm": 11.24698543548584, "kl/avg_steps": 0.5, "kl/beta": 0.02170167863368988, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2158065210664848e-07, "logits/chosen": -0.05803515762090683, "logits/rejected": -0.4103708267211914, "logps/chosen": -98.90908813476562, "logps/ref_chosen": -68.85592651367188, "logps/ref_rejected": -114.98614501953125, "logps/rejected": -169.67323303222656, "loss": 1.0855, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6503223180770874, "rewards/margins": 0.5272072553634644, "rewards/rejected": -1.1775295734405518, "step": 467 }, { "epoch": 0.7074829931972789, "epsilon_dpo/beta": 0.021474391222000122, "epsilon_dpo/beta_margin_grad_mean": -0.3783760070800781, "epsilon_dpo/beta_margin_grad_std": 0.19576546549797058, "epsilon_dpo/beta_margin_mean": 0.5842656493186951, "epsilon_dpo/beta_margin_std": 0.9758591651916504, "epsilon_dpo/loss_margin_mean": 27.47597312927246, "grad_norm": 9.588836669921875, "kl/avg_steps": 0.5625, "kl/beta": 0.02159370854496956, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.204480113956011e-07, "logits/chosen": -0.25211167335510254, "logits/rejected": -0.08560752868652344, "logps/chosen": -119.99714660644531, "logps/ref_chosen": -86.93231201171875, "logps/ref_rejected": -97.35225677490234, "logps/rejected": -157.89306640625, "loss": 1.0908, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7126418352127075, "rewards/margins": 0.5842655897140503, "rewards/rejected": -1.2969074249267578, "step": 468 }, { "epoch": 0.708994708994709, "epsilon_dpo/beta": 0.02141467295587063, "epsilon_dpo/beta_margin_grad_mean": -0.3991638720035553, "epsilon_dpo/beta_margin_grad_std": 0.1622338443994522, "epsilon_dpo/beta_margin_mean": 0.4792685806751251, "epsilon_dpo/beta_margin_std": 0.7716690301895142, "epsilon_dpo/loss_margin_mean": 22.63409996032715, "grad_norm": 10.76986312866211, "kl/avg_steps": 0.28125, "kl/beta": 0.021472923457622528, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.1931899453216697e-07, "logits/chosen": -0.527290940284729, "logits/rejected": -0.24194341897964478, "logps/chosen": -113.86764526367188, "logps/ref_chosen": -81.70521545410156, "logps/ref_rejected": -95.8597640991211, "logps/rejected": -150.65628051757812, "loss": 1.0949, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6916900873184204, "rewards/margins": 0.47926855087280273, "rewards/rejected": -1.1709586381912231, "step": 469 }, { "epoch": 0.7105064247921391, "epsilon_dpo/beta": 0.021307768300175667, "epsilon_dpo/beta_margin_grad_mean": -0.3925294578075409, "epsilon_dpo/beta_margin_grad_std": 0.17281277477741241, "epsilon_dpo/beta_margin_mean": 0.5017194747924805, "epsilon_dpo/beta_margin_std": 0.8305301070213318, "epsilon_dpo/loss_margin_mean": 23.785242080688477, "grad_norm": 13.406644821166992, "kl/avg_steps": 0.5, "kl/beta": 0.021412700414657593, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.1819363309737438e-07, "logits/chosen": 0.09025856852531433, "logits/rejected": -0.2041073441505432, "logps/chosen": -101.21160888671875, "logps/ref_chosen": -69.24808502197266, "logps/ref_rejected": -91.63481140136719, "logps/rejected": -147.38357543945312, "loss": 1.0982, "rewards/accuracies": 0.75, "rewards/chosen": -0.6838077306747437, "rewards/margins": 0.5017194151878357, "rewards/rejected": -1.1855272054672241, "step": 470 }, { "epoch": 0.7120181405895691, "epsilon_dpo/beta": 0.021201759576797485, "epsilon_dpo/beta_margin_grad_mean": -0.36601752042770386, "epsilon_dpo/beta_margin_grad_std": 0.18967947363853455, "epsilon_dpo/beta_margin_mean": 0.6323056817054749, "epsilon_dpo/beta_margin_std": 0.9497935175895691, "epsilon_dpo/loss_margin_mean": 30.101776123046875, "grad_norm": 10.747753143310547, "kl/avg_steps": 0.5, "kl/beta": 0.021306170150637627, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.1707195857000215e-07, "logits/chosen": -0.4186813235282898, "logits/rejected": -0.3955724835395813, "logps/chosen": -109.00920867919922, "logps/ref_chosen": -80.25093078613281, "logps/ref_rejected": -102.29495239257812, "logps/rejected": -161.15499877929688, "loss": 1.0449, "rewards/accuracies": 0.75, "rewards/chosen": -0.6124268174171448, "rewards/margins": 0.6323057413101196, "rewards/rejected": -1.2447326183319092, "step": 471 }, { "epoch": 0.7135298563869993, "epsilon_dpo/beta": 0.021109528839588165, "epsilon_dpo/beta_margin_grad_mean": -0.3851298987865448, "epsilon_dpo/beta_margin_grad_std": 0.19840273261070251, "epsilon_dpo/beta_margin_mean": 0.5623181462287903, "epsilon_dpo/beta_margin_std": 1.0013166666030884, "epsilon_dpo/loss_margin_mean": 26.94999885559082, "grad_norm": 10.546019554138184, "kl/avg_steps": 0.4375, "kl/beta": 0.021200168877840042, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1595400232569768e-07, "logits/chosen": -0.40806734561920166, "logits/rejected": -0.21839666366577148, "logps/chosen": -111.0872802734375, "logps/ref_chosen": -82.03518676757812, "logps/ref_rejected": -102.50106811523438, "logps/rejected": -158.503173828125, "loss": 1.1144, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6167609095573425, "rewards/margins": 0.5623180866241455, "rewards/rejected": -1.1790790557861328, "step": 472 }, { "epoch": 0.7150415721844293, "epsilon_dpo/beta": 0.02099778689444065, "epsilon_dpo/beta_margin_grad_mean": -0.3725453317165375, "epsilon_dpo/beta_margin_grad_std": 0.19428926706314087, "epsilon_dpo/beta_margin_mean": 0.6166351437568665, "epsilon_dpo/beta_margin_std": 0.9741420745849609, "epsilon_dpo/loss_margin_mean": 29.64267921447754, "grad_norm": 9.246530532836914, "kl/avg_steps": 0.53125, "kl/beta": 0.02110782265663147, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.1483979563610069e-07, "logits/chosen": -0.1701044887304306, "logits/rejected": -0.309928834438324, "logps/chosen": -100.15250396728516, "logps/ref_chosen": -75.42779541015625, "logps/ref_rejected": -120.35392761230469, "logps/rejected": -174.7213134765625, "loss": 1.0653, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5218599438667297, "rewards/margins": 0.6166351437568665, "rewards/rejected": -1.1384950876235962, "step": 473 }, { "epoch": 0.7165532879818595, "epsilon_dpo/beta": 0.020906511694192886, "epsilon_dpo/beta_margin_grad_mean": -0.4070540964603424, "epsilon_dpo/beta_margin_grad_std": 0.18986035883426666, "epsilon_dpo/beta_margin_mean": 0.44220811128616333, "epsilon_dpo/beta_margin_std": 0.9186369776725769, "epsilon_dpo/loss_margin_mean": 21.432273864746094, "grad_norm": 13.415764808654785, "kl/avg_steps": 0.4375, "kl/beta": 0.020996280014514923, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1372936966796709e-07, "logits/chosen": -0.3077600598335266, "logits/rejected": -0.49201875925064087, "logps/chosen": -107.1239013671875, "logps/ref_chosen": -70.26007080078125, "logps/ref_rejected": -95.32382202148438, "logps/rejected": -153.61993408203125, "loss": 1.1793, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7730291485786438, "rewards/margins": 0.44220811128616333, "rewards/rejected": -1.2152371406555176, "step": 474 }, { "epoch": 0.7180650037792895, "epsilon_dpo/beta": 0.02077624201774597, "epsilon_dpo/beta_margin_grad_mean": -0.3475864827632904, "epsilon_dpo/beta_margin_grad_std": 0.15696246922016144, "epsilon_dpo/beta_margin_mean": 0.7207489013671875, "epsilon_dpo/beta_margin_std": 0.7813100814819336, "epsilon_dpo/loss_margin_mean": 34.88127899169922, "grad_norm": 10.983023643493652, "kl/avg_steps": 0.625, "kl/beta": 0.020904820412397385, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.126227554822985e-07, "logits/chosen": -0.3970502018928528, "logits/rejected": -0.2638678252696991, "logps/chosen": -116.77337646484375, "logps/ref_chosen": -82.82109069824219, "logps/ref_rejected": -113.2535400390625, "logps/rejected": -182.08709716796875, "loss": 0.9198, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7075306177139282, "rewards/margins": 0.7207489013671875, "rewards/rejected": -1.4282796382904053, "step": 475 }, { "epoch": 0.7195767195767195, "epsilon_dpo/beta": 0.020699141547083855, "epsilon_dpo/beta_margin_grad_mean": -0.4025791883468628, "epsilon_dpo/beta_margin_grad_std": 0.1724308729171753, "epsilon_dpo/beta_margin_mean": 0.45882412791252136, "epsilon_dpo/beta_margin_std": 0.8210156559944153, "epsilon_dpo/loss_margin_mean": 22.426555633544922, "grad_norm": 10.899810791015625, "kl/avg_steps": 0.375, "kl/beta": 0.020774977281689644, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.1151998403347243e-07, "logits/chosen": -0.178768128156662, "logits/rejected": -0.3625830411911011, "logps/chosen": -143.37841796875, "logps/ref_chosen": -102.48684692382812, "logps/ref_rejected": -102.62210083007812, "logps/rejected": -165.94021606445312, "loss": 1.1291, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8488463163375854, "rewards/margins": 0.45882412791252136, "rewards/rejected": -1.3076703548431396, "step": 476 }, { "epoch": 0.7210884353741497, "epsilon_dpo/beta": 0.0206476841121912, "epsilon_dpo/beta_margin_grad_mean": -0.41202253103256226, "epsilon_dpo/beta_margin_grad_std": 0.20400650799274445, "epsilon_dpo/beta_margin_mean": 0.42743879556655884, "epsilon_dpo/beta_margin_std": 0.9944959878921509, "epsilon_dpo/loss_margin_mean": 21.062976837158203, "grad_norm": 14.26219654083252, "kl/avg_steps": 0.25, "kl/beta": 0.02069736272096634, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.1042108616837692e-07, "logits/chosen": -0.4083220660686493, "logits/rejected": -0.20560237765312195, "logps/chosen": -125.48551940917969, "logps/ref_chosen": -82.20433044433594, "logps/ref_rejected": -106.65583801269531, "logps/rejected": -171.0, "loss": 1.2225, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8975530862808228, "rewards/margins": 0.42743879556655884, "rewards/rejected": -1.3249918222427368, "step": 477 }, { "epoch": 0.7226001511715797, "epsilon_dpo/beta": 0.02059619314968586, "epsilon_dpo/beta_margin_grad_mean": -0.42403674125671387, "epsilon_dpo/beta_margin_grad_std": 0.21408548951148987, "epsilon_dpo/beta_margin_mean": 0.3858782947063446, "epsilon_dpo/beta_margin_std": 1.0612176656723022, "epsilon_dpo/loss_margin_mean": 19.113130569458008, "grad_norm": 11.53897476196289, "kl/avg_steps": 0.25, "kl/beta": 0.02064574882388115, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.0932609262554746e-07, "logits/chosen": -0.12969213724136353, "logits/rejected": -0.06466308236122131, "logps/chosen": -120.49067687988281, "logps/ref_chosen": -82.88982391357422, "logps/ref_rejected": -76.89048767089844, "logps/rejected": -133.60446166992188, "loss": 1.2838, "rewards/accuracies": 0.609375, "rewards/chosen": -0.7774056196212769, "rewards/margins": 0.3858782947063446, "rewards/rejected": -1.1632839441299438, "step": 478 }, { "epoch": 0.7241118669690099, "epsilon_dpo/beta": 0.02049977518618107, "epsilon_dpo/beta_margin_grad_mean": -0.40637218952178955, "epsilon_dpo/beta_margin_grad_std": 0.19583311676979065, "epsilon_dpo/beta_margin_mean": 0.43407750129699707, "epsilon_dpo/beta_margin_std": 0.9609951972961426, "epsilon_dpo/loss_margin_mean": 21.468713760375977, "grad_norm": 10.94175910949707, "kl/avg_steps": 0.46875, "kl/beta": 0.020594261586666107, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0823503403430734e-07, "logits/chosen": 0.0649048462510109, "logits/rejected": -0.108011893928051, "logps/chosen": -119.17659759521484, "logps/ref_chosen": -82.81025695800781, "logps/ref_rejected": -93.0564956665039, "logps/rejected": -150.89154052734375, "loss": 1.2025, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7476523518562317, "rewards/margins": 0.43407750129699707, "rewards/rejected": -1.181729793548584, "step": 479 }, { "epoch": 0.7256235827664399, "epsilon_dpo/beta": 0.020410537719726562, "epsilon_dpo/beta_margin_grad_mean": -0.38558438420295715, "epsilon_dpo/beta_margin_grad_std": 0.1957198679447174, "epsilon_dpo/beta_margin_mean": 0.5398164987564087, "epsilon_dpo/beta_margin_std": 0.9710685014724731, "epsilon_dpo/loss_margin_mean": 26.758392333984375, "grad_norm": 14.283851623535156, "kl/avg_steps": 0.4375, "kl/beta": 0.020498177036643028, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0714794091391072e-07, "logits/chosen": -0.2883099913597107, "logits/rejected": 0.03229174017906189, "logps/chosen": -122.21525573730469, "logps/ref_chosen": -86.16099548339844, "logps/ref_rejected": -89.46211242675781, "logps/rejected": -152.27476501464844, "loss": 1.1232, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7387363314628601, "rewards/margins": 0.5398164987564087, "rewards/rejected": -1.278552770614624, "step": 480 }, { "epoch": 0.72713529856387, "epsilon_dpo/beta": 0.020308874547481537, "epsilon_dpo/beta_margin_grad_mean": -0.38809558749198914, "epsilon_dpo/beta_margin_grad_std": 0.18458719551563263, "epsilon_dpo/beta_margin_mean": 0.5430381894111633, "epsilon_dpo/beta_margin_std": 0.9227668046951294, "epsilon_dpo/loss_margin_mean": 27.00404930114746, "grad_norm": 9.960450172424316, "kl/avg_steps": 0.5, "kl/beta": 0.020408889278769493, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0606484367268906e-07, "logits/chosen": -0.2094596028327942, "logits/rejected": -0.17691798508167267, "logps/chosen": -128.1065216064453, "logps/ref_chosen": -91.20343017578125, "logps/ref_rejected": -95.93585968017578, "logps/rejected": -159.84300231933594, "loss": 1.098, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7517346143722534, "rewards/margins": 0.5430382490158081, "rewards/rejected": -1.2947728633880615, "step": 481 }, { "epoch": 0.7286470143613001, "epsilon_dpo/beta": 0.02025860734283924, "epsilon_dpo/beta_margin_grad_mean": -0.4095396399497986, "epsilon_dpo/beta_margin_grad_std": 0.21444030106067657, "epsilon_dpo/beta_margin_mean": 0.4701303243637085, "epsilon_dpo/beta_margin_std": 1.0936204195022583, "epsilon_dpo/loss_margin_mean": 23.593067169189453, "grad_norm": 14.232986450195312, "kl/avg_steps": 0.25, "kl/beta": 0.020307350903749466, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 0.031998008489608765, "logits/rejected": -0.07167947292327881, "logps/chosen": -129.4146728515625, "logps/ref_chosen": -84.04693603515625, "logps/ref_rejected": -138.25186157226562, "logps/rejected": -207.21267700195312, "loss": 1.2254, "rewards/accuracies": 0.625, "rewards/chosen": -0.9225196242332458, "rewards/margins": 0.4701303243637085, "rewards/rejected": -1.3926498889923096, "step": 482 }, { "epoch": 0.7301587301587301, "epsilon_dpo/beta": 0.020132116973400116, "epsilon_dpo/beta_margin_grad_mean": -0.3548187017440796, "epsilon_dpo/beta_margin_grad_std": 0.17624545097351074, "epsilon_dpo/beta_margin_mean": 0.688190758228302, "epsilon_dpo/beta_margin_std": 0.8882216215133667, "epsilon_dpo/loss_margin_mean": 34.3974609375, "grad_norm": 10.093025207519531, "kl/avg_steps": 0.625, "kl/beta": 0.020256709307432175, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.0391075790138232e-07, "logits/chosen": -0.36327123641967773, "logits/rejected": -0.43129682540893555, "logps/chosen": -111.67202758789062, "logps/ref_chosen": -77.54084777832031, "logps/ref_rejected": -106.26319122314453, "logps/rejected": -174.7918243408203, "loss": 0.9805, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6897982358932495, "rewards/margins": 0.6881906986236572, "rewards/rejected": -1.3779890537261963, "step": 483 }, { "epoch": 0.7316704459561603, "epsilon_dpo/beta": 0.02001648023724556, "epsilon_dpo/beta_margin_grad_mean": -0.39846718311309814, "epsilon_dpo/beta_margin_grad_std": 0.15919949114322662, "epsilon_dpo/beta_margin_mean": 0.46173810958862305, "epsilon_dpo/beta_margin_std": 0.7505614161491394, "epsilon_dpo/loss_margin_mean": 23.269933700561523, "grad_norm": 12.176405906677246, "kl/avg_steps": 0.578125, "kl/beta": 0.020130891352891922, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0283982962570681e-07, "logits/chosen": -0.49070245027542114, "logits/rejected": -0.2863999307155609, "logps/chosen": -113.4175033569336, "logps/ref_chosen": -79.68392181396484, "logps/ref_rejected": -97.50027465820312, "logps/rejected": -154.5037841796875, "loss": 1.1034, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6778692603111267, "rewards/margins": 0.46173810958862305, "rewards/rejected": -1.1396074295043945, "step": 484 }, { "epoch": 0.7331821617535903, "epsilon_dpo/beta": 0.019917093217372894, "epsilon_dpo/beta_margin_grad_mean": -0.3898495137691498, "epsilon_dpo/beta_margin_grad_std": 0.170601487159729, "epsilon_dpo/beta_margin_mean": 0.5218725800514221, "epsilon_dpo/beta_margin_std": 0.8386232256889343, "epsilon_dpo/loss_margin_mean": 26.436914443969727, "grad_norm": 10.812078475952148, "kl/avg_steps": 0.5, "kl/beta": 0.020015178248286247, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0177301773633992e-07, "logits/chosen": -0.07163044810295105, "logits/rejected": -0.2883603274822235, "logps/chosen": -121.81181335449219, "logps/ref_chosen": -83.94019317626953, "logps/ref_rejected": -98.3631362915039, "logps/rejected": -162.67166137695312, "loss": 1.0839, "rewards/accuracies": 0.75, "rewards/chosen": -0.7567333579063416, "rewards/margins": 0.5218726396560669, "rewards/rejected": -1.2786059379577637, "step": 485 }, { "epoch": 0.7346938775510204, "epsilon_dpo/beta": 0.019861575216054916, "epsilon_dpo/beta_margin_grad_mean": -0.4068491756916046, "epsilon_dpo/beta_margin_grad_std": 0.2024412304162979, "epsilon_dpo/beta_margin_mean": 0.47068801522254944, "epsilon_dpo/beta_margin_std": 1.022111177444458, "epsilon_dpo/loss_margin_mean": 24.056474685668945, "grad_norm": 9.771635055541992, "kl/avg_steps": 0.28125, "kl/beta": 0.01991560123860836, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.007103520743035e-07, "logits/chosen": 0.19127486646175385, "logits/rejected": -0.22449460625648499, "logps/chosen": -122.89358520507812, "logps/ref_chosen": -78.64771270751953, "logps/ref_rejected": -116.00846862792969, "logps/rejected": -184.31082153320312, "loss": 1.1946, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8815645575523376, "rewards/margins": 0.47068801522254944, "rewards/rejected": -1.3522526025772095, "step": 486 }, { "epoch": 0.7362055933484505, "epsilon_dpo/beta": 0.019787250086665154, "epsilon_dpo/beta_margin_grad_mean": -0.40424415469169617, "epsilon_dpo/beta_margin_grad_std": 0.17017613351345062, "epsilon_dpo/beta_margin_mean": 0.4518623650074005, "epsilon_dpo/beta_margin_std": 0.8180162906646729, "epsilon_dpo/loss_margin_mean": 23.10869789123535, "grad_norm": 9.558370590209961, "kl/avg_steps": 0.375, "kl/beta": 0.01985974609851837, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.965186236464046e-08, "logits/chosen": -0.13608111441135406, "logits/rejected": -0.3792567551136017, "logps/chosen": -131.393798828125, "logps/ref_chosen": -85.29930114746094, "logps/ref_rejected": -108.99090576171875, "logps/rejected": -178.19412231445312, "loss": 1.1325, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9149771928787231, "rewards/margins": 0.4518623650074005, "rewards/rejected": -1.3668395280838013, "step": 487 }, { "epoch": 0.7377173091458806, "epsilon_dpo/beta": 0.01972569152712822, "epsilon_dpo/beta_margin_grad_mean": -0.3908286988735199, "epsilon_dpo/beta_margin_grad_std": 0.18784664571285248, "epsilon_dpo/beta_margin_mean": 0.5446649789810181, "epsilon_dpo/beta_margin_std": 0.9466550946235657, "epsilon_dpo/loss_margin_mean": 27.929880142211914, "grad_norm": 11.954972267150879, "kl/avg_steps": 0.3125, "kl/beta": 0.019785549491643906, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 9.859757821558337e-08, "logits/chosen": -0.36887866258621216, "logits/rejected": -0.5798311233520508, "logps/chosen": -125.43516540527344, "logps/ref_chosen": -87.57951354980469, "logps/ref_rejected": -112.03506469726562, "logps/rejected": -177.8206024169922, "loss": 1.1054, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7492825984954834, "rewards/margins": 0.5446649789810181, "rewards/rejected": -1.2939475774765015, "step": 488 }, { "epoch": 0.7392290249433107, "epsilon_dpo/beta": 0.01966424286365509, "epsilon_dpo/beta_margin_grad_mean": -0.4462127089500427, "epsilon_dpo/beta_margin_grad_std": 0.1676499992609024, "epsilon_dpo/beta_margin_mean": 0.25041741132736206, "epsilon_dpo/beta_margin_std": 0.7841642498970032, "epsilon_dpo/loss_margin_mean": 12.99974536895752, "grad_norm": 10.487590789794922, "kl/avg_steps": 0.3125, "kl/beta": 0.019723912701010704, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 9.754752911772615e-08, "logits/chosen": -0.23175035417079926, "logits/rejected": -0.553443968296051, "logps/chosen": -133.7505645751953, "logps/ref_chosen": -91.55049896240234, "logps/ref_rejected": -108.33108520507812, "logps/rejected": -163.53089904785156, "loss": 1.2916, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8317919373512268, "rewards/margins": 0.2504173517227173, "rewards/rejected": -1.0822093486785889, "step": 489 }, { "epoch": 0.7407407407407407, "epsilon_dpo/beta": 0.01957840286195278, "epsilon_dpo/beta_margin_grad_mean": -0.3897029459476471, "epsilon_dpo/beta_margin_grad_std": 0.19855265319347382, "epsilon_dpo/beta_margin_mean": 0.5350128412246704, "epsilon_dpo/beta_margin_std": 0.9706516265869141, "epsilon_dpo/loss_margin_mean": 27.660192489624023, "grad_norm": 9.340205192565918, "kl/avg_steps": 0.4375, "kl/beta": 0.019662467762827873, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.650174444319956e-08, "logits/chosen": -0.056915752589702606, "logits/rejected": -0.16137561202049255, "logps/chosen": -122.04594421386719, "logps/ref_chosen": -83.35502624511719, "logps/ref_rejected": -97.42626953125, "logps/rejected": -163.77737426757812, "loss": 1.1271, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7606915235519409, "rewards/margins": 0.5350128412246704, "rewards/rejected": -1.2957043647766113, "step": 490 }, { "epoch": 0.7422524565381708, "epsilon_dpo/beta": 0.019499236717820168, "epsilon_dpo/beta_margin_grad_mean": -0.399957537651062, "epsilon_dpo/beta_margin_grad_std": 0.17069052159786224, "epsilon_dpo/beta_margin_mean": 0.4838140904903412, "epsilon_dpo/beta_margin_std": 0.8225111961364746, "epsilon_dpo/loss_margin_mean": 25.07085609436035, "grad_norm": 10.31070327758789, "kl/avg_steps": 0.40625, "kl/beta": 0.019576817750930786, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.546025344484868e-08, "logits/chosen": 0.1169687956571579, "logits/rejected": -0.1125451996922493, "logps/chosen": -119.66255187988281, "logps/ref_chosen": -78.405517578125, "logps/ref_rejected": -98.35725402832031, "logps/rejected": -164.68515014648438, "loss": 1.1083, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8066112399101257, "rewards/margins": 0.4838140606880188, "rewards/rejected": -1.2904253005981445, "step": 491 }, { "epoch": 0.7437641723356009, "epsilon_dpo/beta": 0.019438622519373894, "epsilon_dpo/beta_margin_grad_mean": -0.42248454689979553, "epsilon_dpo/beta_margin_grad_std": 0.1789523810148239, "epsilon_dpo/beta_margin_mean": 0.3718709647655487, "epsilon_dpo/beta_margin_std": 0.8428933024406433, "epsilon_dpo/loss_margin_mean": 19.4351749420166, "grad_norm": 9.827570915222168, "kl/avg_steps": 0.3125, "kl/beta": 0.01949760876595974, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 9.442308525541589e-08, "logits/chosen": -0.11652935296297073, "logits/rejected": -0.3884758949279785, "logps/chosen": -143.1727294921875, "logps/ref_chosen": -91.67491149902344, "logps/ref_rejected": -114.05909729003906, "logps/rejected": -184.99209594726562, "loss": 1.2084, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0039048194885254, "rewards/margins": 0.3718709349632263, "rewards/rejected": -1.375775694847107, "step": 492 }, { "epoch": 0.745275888133031, "epsilon_dpo/beta": 0.019341617822647095, "epsilon_dpo/beta_margin_grad_mean": -0.3907162845134735, "epsilon_dpo/beta_margin_grad_std": 0.18976947665214539, "epsilon_dpo/beta_margin_mean": 0.5252335667610168, "epsilon_dpo/beta_margin_std": 0.924012303352356, "epsilon_dpo/loss_margin_mean": 27.459110260009766, "grad_norm": 10.20104694366455, "kl/avg_steps": 0.5, "kl/beta": 0.019436869770288467, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.339026888672468e-08, "logits/chosen": 0.04558924585580826, "logits/rejected": -0.18673455715179443, "logps/chosen": -119.65789794921875, "logps/ref_chosen": -80.52119445800781, "logps/ref_rejected": -105.2249755859375, "logps/rejected": -171.82077026367188, "loss": 1.1155, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7599451541900635, "rewards/margins": 0.5252335071563721, "rewards/rejected": -1.2851786613464355, "step": 493 }, { "epoch": 0.7467876039304611, "epsilon_dpo/beta": 0.019257480278611183, "epsilon_dpo/beta_margin_grad_mean": -0.4015882611274719, "epsilon_dpo/beta_margin_grad_std": 0.18990257382392883, "epsilon_dpo/beta_margin_mean": 0.477631151676178, "epsilon_dpo/beta_margin_std": 0.9436517953872681, "epsilon_dpo/loss_margin_mean": 25.109621047973633, "grad_norm": 9.032830238342285, "kl/avg_steps": 0.4375, "kl/beta": 0.019340168684720993, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.236183322886945e-08, "logits/chosen": -0.42797666788101196, "logits/rejected": -0.1930256187915802, "logps/chosen": -141.51026916503906, "logps/ref_chosen": -99.7325439453125, "logps/ref_rejected": -124.01393127441406, "logps/rejected": -190.90127563476562, "loss": 1.1578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8071942329406738, "rewards/margins": 0.47763118147850037, "rewards/rejected": -1.284825325012207, "step": 494 }, { "epoch": 0.7482993197278912, "epsilon_dpo/beta": 0.019185630604624748, "epsilon_dpo/beta_margin_grad_mean": -0.4156641960144043, "epsilon_dpo/beta_margin_grad_std": 0.1824708878993988, "epsilon_dpo/beta_margin_mean": 0.3921421468257904, "epsilon_dpo/beta_margin_std": 0.8667027950286865, "epsilon_dpo/loss_margin_mean": 20.752164840698242, "grad_norm": 16.976558685302734, "kl/avg_steps": 0.375, "kl/beta": 0.019255923107266426, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.133780704940594e-08, "logits/chosen": 0.06622087955474854, "logits/rejected": -0.3368031978607178, "logps/chosen": -115.75942993164062, "logps/ref_chosen": -73.57146453857422, "logps/ref_rejected": -95.85881042480469, "logps/rejected": -158.7989501953125, "loss": 1.2008, "rewards/accuracies": 0.703125, "rewards/chosen": -0.810094952583313, "rewards/margins": 0.392142117023468, "rewards/rejected": -1.2022371292114258, "step": 495 }, { "epoch": 0.7498110355253212, "epsilon_dpo/beta": 0.019101964309811592, "epsilon_dpo/beta_margin_grad_mean": -0.3908800482749939, "epsilon_dpo/beta_margin_grad_std": 0.17948482930660248, "epsilon_dpo/beta_margin_mean": 0.5241735577583313, "epsilon_dpo/beta_margin_std": 0.8593854308128357, "epsilon_dpo/loss_margin_mean": 27.73284149169922, "grad_norm": 10.025064468383789, "kl/avg_steps": 0.4375, "kl/beta": 0.019183984026312828, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.031821899254797e-08, "logits/chosen": -0.03599054738879204, "logits/rejected": -0.2996342182159424, "logps/chosen": -119.87254333496094, "logps/ref_chosen": -77.81381225585938, "logps/ref_rejected": -127.28571319580078, "logps/rejected": -197.07728576660156, "loss": 1.0917, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8057615756988525, "rewards/margins": 0.5241735577583313, "rewards/rejected": -1.329935073852539, "step": 496 }, { "epoch": 0.7513227513227513, "epsilon_dpo/beta": 0.01898891106247902, "epsilon_dpo/beta_margin_grad_mean": -0.37980324029922485, "epsilon_dpo/beta_margin_grad_std": 0.1680688112974167, "epsilon_dpo/beta_margin_mean": 0.5702401399612427, "epsilon_dpo/beta_margin_std": 0.8251023292541504, "epsilon_dpo/loss_margin_mean": 30.249765396118164, "grad_norm": 10.120041847229004, "kl/avg_steps": 0.59375, "kl/beta": 0.01910042017698288, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.0725216418504715, "logits/rejected": -0.25850170850753784, "logps/chosen": -143.61422729492188, "logps/ref_chosen": -96.18318176269531, "logps/ref_rejected": -113.01858520507812, "logps/rejected": -190.69940185546875, "loss": 1.0426, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9025899767875671, "rewards/margins": 0.5702401399612427, "rewards/rejected": -1.472830057144165, "step": 497 }, { "epoch": 0.7528344671201814, "epsilon_dpo/beta": 0.018912434577941895, "epsilon_dpo/beta_margin_grad_mean": -0.3832828998565674, "epsilon_dpo/beta_margin_grad_std": 0.16757036745548248, "epsilon_dpo/beta_margin_mean": 0.5597019195556641, "epsilon_dpo/beta_margin_std": 0.8137642741203308, "epsilon_dpo/loss_margin_mean": 29.86823272705078, "grad_norm": 9.708072662353516, "kl/avg_steps": 0.40625, "kl/beta": 0.018987679854035378, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 8.829247120198563e-08, "logits/chosen": -0.2942112386226654, "logits/rejected": -0.17943748831748962, "logps/chosen": -126.88603210449219, "logps/ref_chosen": -87.08412170410156, "logps/ref_rejected": -97.77659606933594, "logps/rejected": -167.44674682617188, "loss": 1.0466, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7544086575508118, "rewards/margins": 0.5597019195556641, "rewards/rejected": -1.314110517501831, "step": 498 }, { "epoch": 0.7543461829176115, "epsilon_dpo/beta": 0.018847733736038208, "epsilon_dpo/beta_margin_grad_mean": -0.4056819677352905, "epsilon_dpo/beta_margin_grad_std": 0.2031472772359848, "epsilon_dpo/beta_margin_mean": 0.44904473423957825, "epsilon_dpo/beta_margin_std": 0.961423933506012, "epsilon_dpo/loss_margin_mean": 24.202713012695312, "grad_norm": 10.174310684204102, "kl/avg_steps": 0.34375, "kl/beta": 0.018910855054855347, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 8.728636813280163e-08, "logits/chosen": -0.3366241455078125, "logits/rejected": -0.31855422258377075, "logps/chosen": -132.77659606933594, "logps/ref_chosen": -84.11980438232422, "logps/ref_rejected": -111.52287292480469, "logps/rejected": -184.3823699951172, "loss": 1.1947, "rewards/accuracies": 0.6875, "rewards/chosen": -0.920242428779602, "rewards/margins": 0.44904473423957825, "rewards/rejected": -1.3692872524261475, "step": 499 }, { "epoch": 0.7558578987150416, "epsilon_dpo/beta": 0.01876549795269966, "epsilon_dpo/beta_margin_grad_mean": -0.3985728323459625, "epsilon_dpo/beta_margin_grad_std": 0.17415080964565277, "epsilon_dpo/beta_margin_mean": 0.48147937655448914, "epsilon_dpo/beta_margin_std": 0.8304465413093567, "epsilon_dpo/loss_margin_mean": 25.937110900878906, "grad_norm": 10.142269134521484, "kl/avg_steps": 0.4375, "kl/beta": 0.018846072256565094, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.3174499571323395, "logits/rejected": -0.3522814214229584, "logps/chosen": -139.42095947265625, "logps/ref_chosen": -93.23562622070312, "logps/ref_rejected": -96.9388427734375, "logps/rejected": -169.061279296875, "loss": 1.1147, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8695238828659058, "rewards/margins": 0.48147934675216675, "rewards/rejected": -1.3510031700134277, "step": 500 }, { "epoch": 0.7558578987150416, "eval_epsilon_dpo/beta": 0.018684910610318184, "eval_epsilon_dpo/beta_margin_grad_mean": -0.39601635932922363, "eval_epsilon_dpo/beta_margin_grad_std": 0.1754881739616394, "eval_epsilon_dpo/beta_margin_mean": 0.4919276833534241, "eval_epsilon_dpo/beta_margin_std": 0.8456751108169556, "eval_epsilon_dpo/loss_margin_mean": 26.623165130615234, "eval_kl/n_epsilon_steps": 0.28433099389076233, "eval_kl/p_epsilon_steps": 0.7156690359115601, "eval_logits/chosen": -0.19369195401668549, "eval_logits/rejected": -0.3317781388759613, "eval_logps/chosen": -130.41204833984375, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -173.8435516357422, "eval_loss": 0.5588386058807373, "eval_rewards/accuracies": 0.7205105423927307, "eval_rewards/chosen": -0.8058603405952454, "eval_rewards/margins": 0.4919276833534241, "eval_rewards/rejected": -1.2977880239486694, "eval_runtime": 47.5848, "eval_samples_per_second": 48.398, "eval_steps_per_second": 1.513, "step": 500 }, { "epoch": 0.7573696145124716, "epsilon_dpo/beta": 0.018677890300750732, "epsilon_dpo/beta_margin_grad_mean": -0.4091666638851166, "epsilon_dpo/beta_margin_grad_std": 0.14577420055866241, "epsilon_dpo/beta_margin_mean": 0.4170517921447754, "epsilon_dpo/beta_margin_std": 0.6771373748779297, "epsilon_dpo/loss_margin_mean": 22.541135787963867, "grad_norm": 8.958548545837402, "kl/avg_steps": 0.46875, "kl/beta": 0.018763979896903038, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.528784436016878e-08, "logits/chosen": -0.31239911913871765, "logits/rejected": -0.3793390989303589, "logps/chosen": -135.99786376953125, "logps/ref_chosen": -88.00139617919922, "logps/ref_rejected": -91.72904968261719, "logps/rejected": -162.26666259765625, "loss": 1.1154, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8980482220649719, "rewards/margins": 0.4170517921447754, "rewards/rejected": -1.3150999546051025, "step": 501 }, { "epoch": 0.7588813303099018, "epsilon_dpo/beta": 0.01859947293996811, "epsilon_dpo/beta_margin_grad_mean": -0.4219035804271698, "epsilon_dpo/beta_margin_grad_std": 0.15145091712474823, "epsilon_dpo/beta_margin_mean": 0.35392406582832336, "epsilon_dpo/beta_margin_std": 0.7095279097557068, "epsilon_dpo/loss_margin_mean": 19.268327713012695, "grad_norm": 11.017980575561523, "kl/avg_steps": 0.421875, "kl/beta": 0.018676433712244034, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 8.4295479559726e-08, "logits/chosen": -0.4218502938747406, "logits/rejected": -0.41790133714675903, "logps/chosen": -133.26429748535156, "logps/ref_chosen": -89.93152618408203, "logps/ref_rejected": -114.26683044433594, "logps/rejected": -176.867919921875, "loss": 1.1771, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8082338571548462, "rewards/margins": 0.353924036026001, "rewards/rejected": -1.1621580123901367, "step": 502 }, { "epoch": 0.7603930461073318, "epsilon_dpo/beta": 0.018501022830605507, "epsilon_dpo/beta_margin_grad_mean": -0.38354945182800293, "epsilon_dpo/beta_margin_grad_std": 0.17253164947032928, "epsilon_dpo/beta_margin_mean": 0.5339672565460205, "epsilon_dpo/beta_margin_std": 0.8142776489257812, "epsilon_dpo/loss_margin_mean": 29.133380889892578, "grad_norm": 9.433905601501465, "kl/avg_steps": 0.53125, "kl/beta": 0.018597973510622978, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.330774987092712e-08, "logits/chosen": -0.2990953326225281, "logits/rejected": 0.10869180411100388, "logps/chosen": -115.01774597167969, "logps/ref_chosen": -72.67813110351562, "logps/ref_rejected": -80.22515106201172, "logps/rejected": -151.69813537597656, "loss": 1.0705, "rewards/accuracies": 0.765625, "rewards/chosen": -0.786790132522583, "rewards/margins": 0.5339672565460205, "rewards/rejected": -1.3207573890686035, "step": 503 }, { "epoch": 0.7619047619047619, "epsilon_dpo/beta": 0.018385907635092735, "epsilon_dpo/beta_margin_grad_mean": -0.3638831079006195, "epsilon_dpo/beta_margin_grad_std": 0.15275923907756805, "epsilon_dpo/beta_margin_mean": 0.63335782289505, "epsilon_dpo/beta_margin_std": 0.7385804057121277, "epsilon_dpo/loss_margin_mean": 34.64598846435547, "grad_norm": 9.538182258605957, "kl/avg_steps": 0.625, "kl/beta": 0.018499692901968956, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.232468292269479e-08, "logits/chosen": -0.1938086897134781, "logits/rejected": -0.06743744015693665, "logps/chosen": -132.69613647460938, "logps/ref_chosen": -92.39486694335938, "logps/ref_rejected": -106.31135559082031, "logps/rejected": -181.25860595703125, "loss": 0.9686, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7422640919685364, "rewards/margins": 0.63335782289505, "rewards/rejected": -1.3756219148635864, "step": 504 }, { "epoch": 0.763416477702192, "epsilon_dpo/beta": 0.018323423340916634, "epsilon_dpo/beta_margin_grad_mean": -0.42362314462661743, "epsilon_dpo/beta_margin_grad_std": 0.19126078486442566, "epsilon_dpo/beta_margin_mean": 0.3635737895965576, "epsilon_dpo/beta_margin_std": 0.9163693189620972, "epsilon_dpo/loss_margin_mean": 20.19578742980957, "grad_norm": 12.260894775390625, "kl/avg_steps": 0.34375, "kl/beta": 0.018384788185358047, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 8.134630621352483e-08, "logits/chosen": -0.25550952553749084, "logits/rejected": -0.604439377784729, "logps/chosen": -130.39122009277344, "logps/ref_chosen": -84.77385711669922, "logps/ref_rejected": -103.21849060058594, "logps/rejected": -169.03164672851562, "loss": 1.2431, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8400043249130249, "rewards/margins": 0.3635737895965576, "rewards/rejected": -1.2035781145095825, "step": 505 }, { "epoch": 0.764928193499622, "epsilon_dpo/beta": 0.0182606503367424, "epsilon_dpo/beta_margin_grad_mean": -0.4268198311328888, "epsilon_dpo/beta_margin_grad_std": 0.19903631508350372, "epsilon_dpo/beta_margin_mean": 0.33908024430274963, "epsilon_dpo/beta_margin_std": 0.9598965048789978, "epsilon_dpo/loss_margin_mean": 18.941492080688477, "grad_norm": 13.514118194580078, "kl/avg_steps": 0.34375, "kl/beta": 0.0183218065649271, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 8.037264711071698e-08, "logits/chosen": -0.545729398727417, "logits/rejected": -0.5596314072608948, "logps/chosen": -143.47811889648438, "logps/ref_chosen": -95.78478240966797, "logps/ref_rejected": -108.5623779296875, "logps/rejected": -175.19720458984375, "loss": 1.2821, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8736474514007568, "rewards/margins": 0.33908021450042725, "rewards/rejected": -1.2127277851104736, "step": 506 }, { "epoch": 0.7664399092970522, "epsilon_dpo/beta": 0.018180975690484047, "epsilon_dpo/beta_margin_grad_mean": -0.40145301818847656, "epsilon_dpo/beta_margin_grad_std": 0.17579539120197296, "epsilon_dpo/beta_margin_mean": 0.47464361786842346, "epsilon_dpo/beta_margin_std": 0.8506050109863281, "epsilon_dpo/loss_margin_mean": 26.402099609375, "grad_norm": 11.599726676940918, "kl/avg_steps": 0.4375, "kl/beta": 0.018259041011333466, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.940373284960933e-08, "logits/chosen": -0.22546380758285522, "logits/rejected": -0.4367539882659912, "logps/chosen": -143.24029541015625, "logps/ref_chosen": -94.66133880615234, "logps/ref_rejected": -119.80830383300781, "logps/rejected": -194.78936767578125, "loss": 1.1256, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8854374885559082, "rewards/margins": 0.47464364767074585, "rewards/rejected": -1.3600811958312988, "step": 507 }, { "epoch": 0.7679516250944822, "epsilon_dpo/beta": 0.01809609867632389, "epsilon_dpo/beta_margin_grad_mean": -0.39532774686813354, "epsilon_dpo/beta_margin_grad_std": 0.17735745012760162, "epsilon_dpo/beta_margin_mean": 0.5011078119277954, "epsilon_dpo/beta_margin_std": 0.8593124151229858, "epsilon_dpo/loss_margin_mean": 27.98598861694336, "grad_norm": 9.803098678588867, "kl/avg_steps": 0.46875, "kl/beta": 0.018179506063461304, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.843959053281663e-08, "logits/chosen": -0.011693110689520836, "logits/rejected": -0.5169368982315063, "logps/chosen": -128.99636840820312, "logps/ref_chosen": -83.423583984375, "logps/ref_rejected": -127.66143798828125, "logps/rejected": -201.22021484375, "loss": 1.1084, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8275296688079834, "rewards/margins": 0.5011078119277954, "rewards/rejected": -1.3286374807357788, "step": 508 }, { "epoch": 0.7694633408919124, "epsilon_dpo/beta": 0.01802298054099083, "epsilon_dpo/beta_margin_grad_mean": -0.3995635509490967, "epsilon_dpo/beta_margin_grad_std": 0.1715720146894455, "epsilon_dpo/beta_margin_mean": 0.4618225693702698, "epsilon_dpo/beta_margin_std": 0.780823290348053, "epsilon_dpo/loss_margin_mean": 25.937952041625977, "grad_norm": 11.328001022338867, "kl/avg_steps": 0.40625, "kl/beta": 0.018094688653945923, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 7.748024712947204e-08, "logits/chosen": -0.03571470454335213, "logits/rejected": -0.03329726681113243, "logps/chosen": -132.56443786621094, "logps/ref_chosen": -89.12423706054688, "logps/ref_rejected": -98.25785064697266, "logps/rejected": -167.63600158691406, "loss": 1.1163, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7848267555236816, "rewards/margins": 0.4618225693702698, "rewards/rejected": -1.2466492652893066, "step": 509 }, { "epoch": 0.7709750566893424, "epsilon_dpo/beta": 0.017944425344467163, "epsilon_dpo/beta_margin_grad_mean": -0.37997183203697205, "epsilon_dpo/beta_margin_grad_std": 0.18886786699295044, "epsilon_dpo/beta_margin_mean": 0.5730319619178772, "epsilon_dpo/beta_margin_std": 0.9132055044174194, "epsilon_dpo/loss_margin_mean": 32.2667350769043, "grad_norm": 11.396764755249023, "kl/avg_steps": 0.4375, "kl/beta": 0.01802147552371025, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.652572947447272e-08, "logits/chosen": -0.37277430295944214, "logits/rejected": -0.651774525642395, "logps/chosen": -112.097412109375, "logps/ref_chosen": -71.99481201171875, "logps/ref_rejected": -115.5155029296875, "logps/rejected": -187.88485717773438, "loss": 1.0765, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7218503952026367, "rewards/margins": 0.573032021522522, "rewards/rejected": -1.2948825359344482, "step": 510 }, { "epoch": 0.7724867724867724, "epsilon_dpo/beta": 0.0178606528788805, "epsilon_dpo/beta_margin_grad_mean": -0.3695341646671295, "epsilon_dpo/beta_margin_grad_std": 0.1857190579175949, "epsilon_dpo/beta_margin_mean": 0.6305907368659973, "epsilon_dpo/beta_margin_std": 0.9014586806297302, "epsilon_dpo/loss_margin_mean": 35.62781524658203, "grad_norm": 10.467903137207031, "kl/avg_steps": 0.46875, "kl/beta": 0.017942974343895912, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.557606426772961e-08, "logits/chosen": -0.44700726866722107, "logits/rejected": -0.35447224974632263, "logps/chosen": -122.42686462402344, "logps/ref_chosen": -79.94530487060547, "logps/ref_rejected": -112.25093078613281, "logps/rejected": -190.36029052734375, "loss": 1.0288, "rewards/accuracies": 0.75, "rewards/chosen": -0.7609899044036865, "rewards/margins": 0.6305907964706421, "rewards/rejected": -1.3915807008743286, "step": 511 }, { "epoch": 0.7739984882842026, "epsilon_dpo/beta": 0.01781081035733223, "epsilon_dpo/beta_margin_grad_mean": -0.4195144772529602, "epsilon_dpo/beta_margin_grad_std": 0.1699885129928589, "epsilon_dpo/beta_margin_mean": 0.3944665491580963, "epsilon_dpo/beta_margin_std": 0.8180778622627258, "epsilon_dpo/loss_margin_mean": 22.461170196533203, "grad_norm": 10.9349946975708, "kl/avg_steps": 0.28125, "kl/beta": 0.017859259620308876, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 7.463127807341966e-08, "logits/chosen": -0.30398204922676086, "logits/rejected": -0.6556848287582397, "logps/chosen": -131.9627227783203, "logps/ref_chosen": -86.12109375, "logps/ref_rejected": -89.51139068603516, "logps/rejected": -157.814208984375, "loss": 1.1781, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8200229406356812, "rewards/margins": 0.3944665491580963, "rewards/rejected": -1.214489459991455, "step": 512 }, { "epoch": 0.7755102040816326, "epsilon_dpo/beta": 0.01772189699113369, "epsilon_dpo/beta_margin_grad_mean": -0.39932936429977417, "epsilon_dpo/beta_margin_grad_std": 0.1577375829219818, "epsilon_dpo/beta_margin_mean": 0.47727683186531067, "epsilon_dpo/beta_margin_std": 0.7684574723243713, "epsilon_dpo/loss_margin_mean": 27.176280975341797, "grad_norm": 8.833486557006836, "kl/avg_steps": 0.5, "kl/beta": 0.017809171229600906, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.369139731924401e-08, "logits/chosen": -0.011374262161552906, "logits/rejected": -0.04961850494146347, "logps/chosen": -115.10137939453125, "logps/ref_chosen": -74.36705017089844, "logps/ref_rejected": -93.00857543945312, "logps/rejected": -160.919189453125, "loss": 1.0937, "rewards/accuracies": 0.75, "rewards/chosen": -0.7242238521575928, "rewards/margins": 0.4772768020629883, "rewards/rejected": -1.201500654220581, "step": 513 }, { "epoch": 0.7770219198790628, "epsilon_dpo/beta": 0.01762819103896618, "epsilon_dpo/beta_margin_grad_mean": -0.37859275937080383, "epsilon_dpo/beta_margin_grad_std": 0.16942396759986877, "epsilon_dpo/beta_margin_mean": 0.5625271201133728, "epsilon_dpo/beta_margin_std": 0.8023771047592163, "epsilon_dpo/loss_margin_mean": 32.183902740478516, "grad_norm": 11.468728065490723, "kl/avg_steps": 0.53125, "kl/beta": 0.017720568925142288, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.275644829568747e-08, "logits/chosen": -0.3658566474914551, "logits/rejected": -0.33812397718429565, "logps/chosen": -136.5714874267578, "logps/ref_chosen": -91.89642333984375, "logps/ref_rejected": -109.63691711425781, "logps/rejected": -186.49588012695312, "loss": 1.044, "rewards/accuracies": 0.765625, "rewards/chosen": -0.790207028388977, "rewards/margins": 0.562527060508728, "rewards/rejected": -1.352734088897705, "step": 514 }, { "epoch": 0.7785336356764928, "epsilon_dpo/beta": 0.017551563680171967, "epsilon_dpo/beta_margin_grad_mean": -0.40302208065986633, "epsilon_dpo/beta_margin_grad_std": 0.17570269107818604, "epsilon_dpo/beta_margin_mean": 0.45750582218170166, "epsilon_dpo/beta_margin_std": 0.8222211599349976, "epsilon_dpo/loss_margin_mean": 26.378746032714844, "grad_norm": 10.036144256591797, "kl/avg_steps": 0.4375, "kl/beta": 0.01762692630290985, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.182645715528435e-08, "logits/chosen": -0.2181454449892044, "logits/rejected": -0.17116093635559082, "logps/chosen": -128.15139770507812, "logps/ref_chosen": -80.23466491699219, "logps/ref_rejected": -113.73807525634766, "logps/rejected": -188.03355407714844, "loss": 1.1322, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8435226678848267, "rewards/margins": 0.45750582218170166, "rewards/rejected": -1.3010284900665283, "step": 515 }, { "epoch": 0.780045351473923, "epsilon_dpo/beta": 0.017458654940128326, "epsilon_dpo/beta_margin_grad_mean": -0.4042704999446869, "epsilon_dpo/beta_margin_grad_std": 0.16673848032951355, "epsilon_dpo/beta_margin_mean": 0.4266871213912964, "epsilon_dpo/beta_margin_std": 0.7753763794898987, "epsilon_dpo/loss_margin_mean": 24.717382431030273, "grad_norm": 10.534723281860352, "kl/avg_steps": 0.53125, "kl/beta": 0.017550144344568253, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.090144991188568e-08, "logits/chosen": -0.0516197606921196, "logits/rejected": -0.16222411394119263, "logps/chosen": -121.34104919433594, "logps/ref_chosen": -82.21078491210938, "logps/ref_rejected": -100.97323608398438, "logps/rejected": -164.82089233398438, "loss": 1.1413, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6861926317214966, "rewards/margins": 0.426687091588974, "rewards/rejected": -1.112879753112793, "step": 516 }, { "epoch": 0.781557067271353, "epsilon_dpo/beta": 0.017393674701452255, "epsilon_dpo/beta_margin_grad_mean": -0.43510809540748596, "epsilon_dpo/beta_margin_grad_std": 0.1779409646987915, "epsilon_dpo/beta_margin_mean": 0.32335782051086426, "epsilon_dpo/beta_margin_std": 0.8659263253211975, "epsilon_dpo/loss_margin_mean": 18.899690628051758, "grad_norm": 10.160877227783203, "kl/avg_steps": 0.375, "kl/beta": 0.017457401379942894, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.998145243993284e-08, "logits/chosen": -0.191817507147789, "logits/rejected": -0.15607379376888275, "logps/chosen": -131.53024291992188, "logps/ref_chosen": -83.18726348876953, "logps/ref_rejected": -87.62814331054688, "logps/rejected": -154.87081909179688, "loss": 1.2549, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8429970741271973, "rewards/margins": 0.32335782051086426, "rewards/rejected": -1.1663548946380615, "step": 517 }, { "epoch": 0.783068783068783, "epsilon_dpo/beta": 0.017355870455503464, "epsilon_dpo/beta_margin_grad_mean": -0.41893237829208374, "epsilon_dpo/beta_margin_grad_std": 0.17085179686546326, "epsilon_dpo/beta_margin_mean": 0.38197755813598633, "epsilon_dpo/beta_margin_std": 0.8127449750900269, "epsilon_dpo/loss_margin_mean": 22.35114097595215, "grad_norm": 9.441346168518066, "kl/avg_steps": 0.21875, "kl/beta": 0.017392180860042572, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 6.906649047373245e-08, "logits/chosen": -0.2893047034740448, "logits/rejected": -0.5340129733085632, "logps/chosen": -120.58061218261719, "logps/ref_chosen": -82.70620727539062, "logps/ref_rejected": -103.10424041748047, "logps/rejected": -163.32977294921875, "loss": 1.188, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6589775085449219, "rewards/margins": 0.38197755813598633, "rewards/rejected": -1.0409550666809082, "step": 518 }, { "epoch": 0.7845804988662132, "epsilon_dpo/beta": 0.017307139933109283, "epsilon_dpo/beta_margin_grad_mean": -0.4324077367782593, "epsilon_dpo/beta_margin_grad_std": 0.21184632182121277, "epsilon_dpo/beta_margin_mean": 0.31561097502708435, "epsilon_dpo/beta_margin_std": 1.0064094066619873, "epsilon_dpo/loss_margin_mean": 18.674875259399414, "grad_norm": 9.743617057800293, "kl/avg_steps": 0.28125, "kl/beta": 0.017354218289256096, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 6.815658960673781e-08, "logits/chosen": -0.023153584450483322, "logits/rejected": -0.30077916383743286, "logps/chosen": -133.44569396972656, "logps/ref_chosen": -83.7763442993164, "logps/ref_rejected": -104.38874816894531, "logps/rejected": -172.73297119140625, "loss": 1.3247, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8625408411026001, "rewards/margins": 0.31561097502708435, "rewards/rejected": -1.1781518459320068, "step": 519 }, { "epoch": 0.7860922146636432, "epsilon_dpo/beta": 0.017226146534085274, "epsilon_dpo/beta_margin_grad_mean": -0.4106348156929016, "epsilon_dpo/beta_margin_grad_std": 0.14547856152057648, "epsilon_dpo/beta_margin_mean": 0.40368935465812683, "epsilon_dpo/beta_margin_std": 0.6645292043685913, "epsilon_dpo/loss_margin_mean": 23.6751651763916, "grad_norm": 10.935470581054688, "kl/avg_steps": 0.46875, "kl/beta": 0.017305545508861542, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.725177529083209e-08, "logits/chosen": -0.020371049642562866, "logits/rejected": -0.517693281173706, "logps/chosen": -128.94769287109375, "logps/ref_chosen": -87.44279479980469, "logps/ref_rejected": -101.50048828125, "logps/rejected": -166.68057250976562, "loss": 1.1235, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7172441482543945, "rewards/margins": 0.40368932485580444, "rewards/rejected": -1.1209335327148438, "step": 520 }, { "epoch": 0.7876039304610734, "epsilon_dpo/beta": 0.01715654507279396, "epsilon_dpo/beta_margin_grad_mean": -0.390647292137146, "epsilon_dpo/beta_margin_grad_std": 0.17379479110240936, "epsilon_dpo/beta_margin_mean": 0.5044295191764832, "epsilon_dpo/beta_margin_std": 0.830818235874176, "epsilon_dpo/loss_margin_mean": 29.717973709106445, "grad_norm": 8.840238571166992, "kl/avg_steps": 0.40625, "kl/beta": 0.017224805429577827, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 6.63520728356167e-08, "logits/chosen": 0.1258752942085266, "logits/rejected": -0.2747128903865814, "logps/chosen": -136.19918823242188, "logps/ref_chosen": -93.29463195800781, "logps/ref_rejected": -115.53233337402344, "logps/rejected": -188.15487670898438, "loss": 1.0978, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7379418611526489, "rewards/margins": 0.5044295191764832, "rewards/rejected": -1.2423713207244873, "step": 521 }, { "epoch": 0.7891156462585034, "epsilon_dpo/beta": 0.017097851261496544, "epsilon_dpo/beta_margin_grad_mean": -0.42847952246665955, "epsilon_dpo/beta_margin_grad_std": 0.19407261908054352, "epsilon_dpo/beta_margin_mean": 0.3344910442829132, "epsilon_dpo/beta_margin_std": 0.9310140609741211, "epsilon_dpo/loss_margin_mean": 19.93511962890625, "grad_norm": 12.875117301940918, "kl/avg_steps": 0.34375, "kl/beta": 0.017155112698674202, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 6.545750740770336e-08, "logits/chosen": -0.20354172587394714, "logits/rejected": -0.12197837978601456, "logps/chosen": -129.2339324951172, "logps/ref_chosen": -84.8248291015625, "logps/ref_rejected": -90.75812530517578, "logps/rejected": -155.10235595703125, "loss": 1.2743, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7611554861068726, "rewards/margins": 0.3344910144805908, "rewards/rejected": -1.0956465005874634, "step": 522 }, { "epoch": 0.7906273620559335, "epsilon_dpo/beta": 0.01701790653169155, "epsilon_dpo/beta_margin_grad_mean": -0.4060684144496918, "epsilon_dpo/beta_margin_grad_std": 0.1653435081243515, "epsilon_dpo/beta_margin_mean": 0.42627379298210144, "epsilon_dpo/beta_margin_std": 0.7653840780258179, "epsilon_dpo/loss_margin_mean": 25.342716217041016, "grad_norm": 10.446045875549316, "kl/avg_steps": 0.46875, "kl/beta": 0.017096344381570816, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.456810403001012e-08, "logits/chosen": -0.18978756666183472, "logits/rejected": -0.33216989040374756, "logps/chosen": -134.27200317382812, "logps/ref_chosen": -87.69473266601562, "logps/ref_rejected": -127.14915466308594, "logps/rejected": -199.0691375732422, "loss": 1.138, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7956066727638245, "rewards/margins": 0.42627379298210144, "rewards/rejected": -1.2218804359436035, "step": 523 }, { "epoch": 0.7921390778533636, "epsilon_dpo/beta": 0.01693318784236908, "epsilon_dpo/beta_margin_grad_mean": -0.41478970646858215, "epsilon_dpo/beta_margin_grad_std": 0.16659562289714813, "epsilon_dpo/beta_margin_mean": 0.37822225689888, "epsilon_dpo/beta_margin_std": 0.748524010181427, "epsilon_dpo/loss_margin_mean": 22.62693977355957, "grad_norm": 9.329117774963379, "kl/avg_steps": 0.5, "kl/beta": 0.01701657846570015, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.368388758106134e-08, "logits/chosen": -0.4357072114944458, "logits/rejected": -0.3554335832595825, "logps/chosen": -138.2708740234375, "logps/ref_chosen": -101.48445129394531, "logps/ref_rejected": -116.5414047241211, "logps/rejected": -175.95477294921875, "loss": 1.1738, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6247434616088867, "rewards/margins": 0.37822225689888, "rewards/rejected": -1.0029656887054443, "step": 524 }, { "epoch": 0.7936507936507936, "epsilon_dpo/beta": 0.016896570101380348, "epsilon_dpo/beta_margin_grad_mean": -0.43039360642433167, "epsilon_dpo/beta_margin_grad_std": 0.15428684651851654, "epsilon_dpo/beta_margin_mean": 0.3196568191051483, "epsilon_dpo/beta_margin_std": 0.7115899324417114, "epsilon_dpo/loss_margin_mean": 19.21969223022461, "grad_norm": 8.687216758728027, "kl/avg_steps": 0.21875, "kl/beta": 0.016931919381022453, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 6.280488279429185e-08, "logits/chosen": -0.5150451064109802, "logits/rejected": -0.6030400395393372, "logps/chosen": -145.2410888671875, "logps/ref_chosen": -100.77320861816406, "logps/ref_rejected": -106.02122497558594, "logps/rejected": -169.70880126953125, "loss": 1.2076, "rewards/accuracies": 0.625, "rewards/chosen": -0.7533972859382629, "rewards/margins": 0.3196568191051483, "rewards/rejected": -1.0730540752410889, "step": 525 }, { "epoch": 0.7951625094482238, "epsilon_dpo/beta": 0.016849128529429436, "epsilon_dpo/beta_margin_grad_mean": -0.4284980595111847, "epsilon_dpo/beta_margin_grad_std": 0.15590006113052368, "epsilon_dpo/beta_margin_mean": 0.3317577838897705, "epsilon_dpo/beta_margin_std": 0.725226879119873, "epsilon_dpo/loss_margin_mean": 19.985027313232422, "grad_norm": 8.52363109588623, "kl/avg_steps": 0.28125, "kl/beta": 0.016894960775971413, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 6.193111425735515e-08, "logits/chosen": -0.17107675969600677, "logits/rejected": -0.36276209354400635, "logps/chosen": -129.8316192626953, "logps/ref_chosen": -83.02246856689453, "logps/ref_rejected": -108.54130554199219, "logps/rejected": -175.33547973632812, "loss": 1.2012, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7910605669021606, "rewards/margins": 0.3317577838897705, "rewards/rejected": -1.1228183507919312, "step": 526 }, { "epoch": 0.7966742252456538, "epsilon_dpo/beta": 0.016817670315504074, "epsilon_dpo/beta_margin_grad_mean": -0.43790262937545776, "epsilon_dpo/beta_margin_grad_std": 0.1645323485136032, "epsilon_dpo/beta_margin_mean": 0.28252390027046204, "epsilon_dpo/beta_margin_std": 0.7518173456192017, "epsilon_dpo/loss_margin_mean": 17.13136863708496, "grad_norm": 9.994187355041504, "kl/avg_steps": 0.1875, "kl/beta": 0.016847576946020126, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 6.106260641143546e-08, "logits/chosen": -0.1380855292081833, "logits/rejected": -0.5256826877593994, "logps/chosen": -134.24891662597656, "logps/ref_chosen": -82.10969543457031, "logps/ref_rejected": -118.33562469482422, "logps/rejected": -187.60621643066406, "loss": 1.2541, "rewards/accuracies": 0.609375, "rewards/chosen": -0.8798421621322632, "rewards/margins": 0.28252387046813965, "rewards/rejected": -1.1623661518096924, "step": 527 }, { "epoch": 0.7981859410430839, "epsilon_dpo/beta": 0.01674940623342991, "epsilon_dpo/beta_margin_grad_mean": -0.414455384016037, "epsilon_dpo/beta_margin_grad_std": 0.17547161877155304, "epsilon_dpo/beta_margin_mean": 0.38497909903526306, "epsilon_dpo/beta_margin_std": 0.8312879800796509, "epsilon_dpo/loss_margin_mean": 23.30469512939453, "grad_norm": 8.494281768798828, "kl/avg_steps": 0.40625, "kl/beta": 0.016816047951579094, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 6.019938355056422e-08, "logits/chosen": 0.061112236231565475, "logits/rejected": -0.18409264087677002, "logps/chosen": -120.67262268066406, "logps/ref_chosen": -80.03038787841797, "logps/ref_rejected": -102.83058166503906, "logps/rejected": -166.7775115966797, "loss": 1.1941, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6828817129135132, "rewards/margins": 0.38497909903526306, "rewards/rejected": -1.0678608417510986, "step": 528 }, { "epoch": 0.799697656840514, "epsilon_dpo/beta": 0.016650233417749405, "epsilon_dpo/beta_margin_grad_mean": -0.34349969029426575, "epsilon_dpo/beta_margin_grad_std": 0.16111795604228973, "epsilon_dpo/beta_margin_mean": 0.7405751943588257, "epsilon_dpo/beta_margin_std": 0.7790158987045288, "epsilon_dpo/loss_margin_mean": 44.73019790649414, "grad_norm": 8.767058372497559, "kl/avg_steps": 0.59375, "kl/beta": 0.016748009249567986, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.934146982094049e-08, "logits/chosen": 0.014895215630531311, "logits/rejected": -0.07006177306175232, "logps/chosen": -113.50651550292969, "logps/ref_chosen": -78.1475830078125, "logps/ref_rejected": -98.78194427490234, "logps/rejected": -178.87107849121094, "loss": 0.9084, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5910122990608215, "rewards/margins": 0.7405751943588257, "rewards/rejected": -1.331587553024292, "step": 529 }, { "epoch": 0.8012093726379441, "epsilon_dpo/beta": 0.016588380560278893, "epsilon_dpo/beta_margin_grad_mean": -0.3937232792377472, "epsilon_dpo/beta_margin_grad_std": 0.1646030843257904, "epsilon_dpo/beta_margin_mean": 0.49445220828056335, "epsilon_dpo/beta_margin_std": 0.761608362197876, "epsilon_dpo/loss_margin_mean": 30.12356948852539, "grad_norm": 9.491631507873535, "kl/avg_steps": 0.375, "kl/beta": 0.016649154946208, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.3080604076385498, "logits/rejected": -0.39112964272499084, "logps/chosen": -124.18809509277344, "logps/ref_chosen": -84.73670196533203, "logps/ref_rejected": -100.79402160644531, "logps/rejected": -170.36898803710938, "loss": 1.0825, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6560468673706055, "rewards/margins": 0.49445223808288574, "rewards/rejected": -1.1504991054534912, "step": 530 }, { "epoch": 0.8027210884353742, "epsilon_dpo/beta": 0.01652122102677822, "epsilon_dpo/beta_margin_grad_mean": -0.41418811678886414, "epsilon_dpo/beta_margin_grad_std": 0.14102323353290558, "epsilon_dpo/beta_margin_mean": 0.38079819083213806, "epsilon_dpo/beta_margin_std": 0.6443190574645996, "epsilon_dpo/loss_margin_mean": 23.29291343688965, "grad_norm": 12.684146881103516, "kl/avg_steps": 0.40625, "kl/beta": 0.01658695377409458, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.7641665597021435e-08, "logits/chosen": -0.11376606673002243, "logits/rejected": -0.1980658918619156, "logps/chosen": -117.44862365722656, "logps/ref_chosen": -75.68649291992188, "logps/ref_rejected": -102.33787536621094, "logps/rejected": -167.39292907714844, "loss": 1.1365, "rewards/accuracies": 0.75, "rewards/chosen": -0.6924214959144592, "rewards/margins": 0.38079819083213806, "rewards/rejected": -1.0732197761535645, "step": 531 }, { "epoch": 0.8042328042328042, "epsilon_dpo/beta": 0.016438886523246765, "epsilon_dpo/beta_margin_grad_mean": -0.40522220730781555, "epsilon_dpo/beta_margin_grad_std": 0.16460780799388885, "epsilon_dpo/beta_margin_mean": 0.4439644515514374, "epsilon_dpo/beta_margin_std": 0.7691618204116821, "epsilon_dpo/loss_margin_mean": 27.288150787353516, "grad_norm": 8.678123474121094, "kl/avg_steps": 0.5, "kl/beta": 0.01651984080672264, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.679982264990424e-08, "logits/chosen": -0.21569831669330597, "logits/rejected": -0.23878905177116394, "logps/chosen": -130.5865478515625, "logps/ref_chosen": -83.01626586914062, "logps/ref_rejected": -103.29268646240234, "logps/rejected": -178.151123046875, "loss": 1.1238, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7830289602279663, "rewards/margins": 0.44396448135375977, "rewards/rejected": -1.226993441581726, "step": 532 }, { "epoch": 0.8057445200302343, "epsilon_dpo/beta": 0.016387924551963806, "epsilon_dpo/beta_margin_grad_mean": -0.4147703945636749, "epsilon_dpo/beta_margin_grad_std": 0.18282870948314667, "epsilon_dpo/beta_margin_mean": 0.40231624245643616, "epsilon_dpo/beta_margin_std": 0.8481965065002441, "epsilon_dpo/loss_margin_mean": 24.933364868164062, "grad_norm": 10.612200736999512, "kl/avg_steps": 0.3125, "kl/beta": 0.016437653452157974, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 5.596338392706076e-08, "logits/chosen": 0.08782052993774414, "logits/rejected": -0.4669248163700104, "logps/chosen": -113.88616943359375, "logps/ref_chosen": -78.34988403320312, "logps/ref_rejected": -96.21743774414062, "logps/rejected": -156.6870880126953, "loss": 1.1873, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5851448774337769, "rewards/margins": 0.40231621265411377, "rewards/rejected": -0.9874610900878906, "step": 533 }, { "epoch": 0.8072562358276644, "epsilon_dpo/beta": 0.016306143254041672, "epsilon_dpo/beta_margin_grad_mean": -0.39130768179893494, "epsilon_dpo/beta_margin_grad_std": 0.16498181223869324, "epsilon_dpo/beta_margin_mean": 0.4981834590435028, "epsilon_dpo/beta_margin_std": 0.7556285858154297, "epsilon_dpo/loss_margin_mean": 30.85486602783203, "grad_norm": 10.962309837341309, "kl/avg_steps": 0.5, "kl/beta": 0.016386445611715317, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.513237282548033e-08, "logits/chosen": -0.48020732402801514, "logits/rejected": -0.33289915323257446, "logps/chosen": -116.89681243896484, "logps/ref_chosen": -84.2362060546875, "logps/ref_rejected": -98.3175048828125, "logps/rejected": -161.83297729492188, "loss": 1.0786, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5349410176277161, "rewards/margins": 0.4981834292411804, "rewards/rejected": -1.0331244468688965, "step": 534 }, { "epoch": 0.8087679516250945, "epsilon_dpo/beta": 0.01623521000146866, "epsilon_dpo/beta_margin_grad_mean": -0.4018467962741852, "epsilon_dpo/beta_margin_grad_std": 0.1728149652481079, "epsilon_dpo/beta_margin_mean": 0.4536625146865845, "epsilon_dpo/beta_margin_std": 0.8184735178947449, "epsilon_dpo/loss_margin_mean": 28.273239135742188, "grad_norm": 7.956579685211182, "kl/avg_steps": 0.4375, "kl/beta": 0.016304921358823776, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.430681259032957e-08, "logits/chosen": -0.20313288271427155, "logits/rejected": -0.416176974773407, "logps/chosen": -125.93107604980469, "logps/ref_chosen": -84.17073822021484, "logps/ref_rejected": -104.83849334716797, "logps/rejected": -174.8720703125, "loss": 1.1333, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6805884838104248, "rewards/margins": 0.4536625146865845, "rewards/rejected": -1.1342509984970093, "step": 535 }, { "epoch": 0.8102796674225246, "epsilon_dpo/beta": 0.016134047880768776, "epsilon_dpo/beta_margin_grad_mean": -0.37715545296669006, "epsilon_dpo/beta_margin_grad_std": 0.1385839879512787, "epsilon_dpo/beta_margin_mean": 0.5649297833442688, "epsilon_dpo/beta_margin_std": 0.6600634455680847, "epsilon_dpo/loss_margin_mean": 35.17201614379883, "grad_norm": 7.054563999176025, "kl/avg_steps": 0.625, "kl/beta": 0.01623389683663845, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -0.08141159266233444, "logits/rejected": -0.3341004550457001, "logps/chosen": -105.59613037109375, "logps/ref_chosen": -70.18797302246094, "logps/ref_rejected": -97.0616455078125, "logps/rejected": -167.64183044433594, "loss": 0.9949, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5719255805015564, "rewards/margins": 0.5649298429489136, "rewards/rejected": -1.1368553638458252, "step": 536 }, { "epoch": 0.8117913832199547, "epsilon_dpo/beta": 0.016084259375929832, "epsilon_dpo/beta_margin_grad_mean": -0.4204951226711273, "epsilon_dpo/beta_margin_grad_std": 0.15833215415477753, "epsilon_dpo/beta_margin_mean": 0.35688626766204834, "epsilon_dpo/beta_margin_std": 0.7070753574371338, "epsilon_dpo/loss_margin_mean": 22.500165939331055, "grad_norm": 10.414916038513184, "kl/avg_steps": 0.3125, "kl/beta": 0.01613306626677513, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 5.267213693697695e-08, "logits/chosen": -0.13419167697429657, "logits/rejected": -0.4870593547821045, "logps/chosen": -133.16604614257812, "logps/ref_chosen": -80.709716796875, "logps/ref_rejected": -116.72985076904297, "logps/rejected": -191.68634033203125, "loss": 1.1774, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8454914093017578, "rewards/margins": 0.35688626766204834, "rewards/rejected": -1.2023777961730957, "step": 537 }, { "epoch": 0.8133030990173847, "epsilon_dpo/beta": 0.016014045104384422, "epsilon_dpo/beta_margin_grad_mean": -0.4055761396884918, "epsilon_dpo/beta_margin_grad_std": 0.16074000298976898, "epsilon_dpo/beta_margin_mean": 0.437906414270401, "epsilon_dpo/beta_margin_std": 0.7546932697296143, "epsilon_dpo/loss_margin_mean": 27.643733978271484, "grad_norm": 8.242533683776855, "kl/avg_steps": 0.4375, "kl/beta": 0.016082806512713432, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.1863067244167144e-08, "logits/chosen": -0.13110458850860596, "logits/rejected": -0.23731385171413422, "logps/chosen": -143.7471466064453, "logps/ref_chosen": -94.1431884765625, "logps/ref_rejected": -103.38029479980469, "logps/rejected": -180.62799072265625, "loss": 1.1233, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7969592809677124, "rewards/margins": 0.4379063844680786, "rewards/rejected": -1.234865665435791, "step": 538 }, { "epoch": 0.8148148148148148, "epsilon_dpo/beta": 0.015919266268610954, "epsilon_dpo/beta_margin_grad_mean": -0.4092956781387329, "epsilon_dpo/beta_margin_grad_std": 0.13377299904823303, "epsilon_dpo/beta_margin_mean": 0.40943920612335205, "epsilon_dpo/beta_margin_std": 0.6206133961677551, "epsilon_dpo/loss_margin_mean": 25.90907859802246, "grad_norm": 9.922149658203125, "kl/avg_steps": 0.59375, "kl/beta": 0.016012750566005707, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.105953986729195e-08, "logits/chosen": 0.24353787302970886, "logits/rejected": -0.5500741004943848, "logps/chosen": -129.62351989746094, "logps/ref_chosen": -84.61248779296875, "logps/ref_rejected": -107.14148712158203, "logps/rejected": -178.06158447265625, "loss": 1.1052, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7179849147796631, "rewards/margins": 0.40943920612335205, "rewards/rejected": -1.1274241209030151, "step": 539 }, { "epoch": 0.8163265306122449, "epsilon_dpo/beta": 0.015845203772187233, "epsilon_dpo/beta_margin_grad_mean": -0.3944285809993744, "epsilon_dpo/beta_margin_grad_std": 0.1632717400789261, "epsilon_dpo/beta_margin_mean": 0.48868271708488464, "epsilon_dpo/beta_margin_std": 0.7638451457023621, "epsilon_dpo/loss_margin_mean": 31.13498306274414, "grad_norm": 9.602672576904297, "kl/avg_steps": 0.46875, "kl/beta": 0.01591823622584343, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.026157728273966e-08, "logits/chosen": -0.14873777329921722, "logits/rejected": -0.4375326335430145, "logps/chosen": -132.97463989257812, "logps/ref_chosen": -89.30429077148438, "logps/ref_rejected": -122.46199035644531, "logps/rejected": -197.267333984375, "loss": 1.0872, "rewards/accuracies": 0.75, "rewards/chosen": -0.6940083503723145, "rewards/margins": 0.48868274688720703, "rewards/rejected": -1.1826910972595215, "step": 540 }, { "epoch": 0.817838246409675, "epsilon_dpo/beta": 0.015781180933117867, "epsilon_dpo/beta_margin_grad_mean": -0.38085731863975525, "epsilon_dpo/beta_margin_grad_std": 0.16132763028144836, "epsilon_dpo/beta_margin_mean": 0.5413305163383484, "epsilon_dpo/beta_margin_std": 0.7297938466072083, "epsilon_dpo/loss_margin_mean": 34.643150329589844, "grad_norm": 12.654080390930176, "kl/avg_steps": 0.40625, "kl/beta": 0.015843968838453293, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.9469201811239035e-08, "logits/chosen": -0.13464142382144928, "logits/rejected": 0.12418127059936523, "logps/chosen": -125.31890869140625, "logps/ref_chosen": -87.92848205566406, "logps/ref_rejected": -81.73931884765625, "logps/rejected": -153.7729034423828, "loss": 1.0383, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5931801199913025, "rewards/margins": 0.5413305163383484, "rewards/rejected": -1.1345106363296509, "step": 541 }, { "epoch": 0.8193499622071051, "epsilon_dpo/beta": 0.01569266989827156, "epsilon_dpo/beta_margin_grad_mean": -0.37570175528526306, "epsilon_dpo/beta_margin_grad_std": 0.16721844673156738, "epsilon_dpo/beta_margin_mean": 0.5741678476333618, "epsilon_dpo/beta_margin_std": 0.7826369404792786, "epsilon_dpo/loss_margin_mean": 36.878963470458984, "grad_norm": 9.048364639282227, "kl/avg_steps": 0.5625, "kl/beta": 0.01577986218035221, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.868243561723534e-08, "logits/chosen": -0.06194941699504852, "logits/rejected": -0.1134922131896019, "logps/chosen": -103.31153869628906, "logps/ref_chosen": -71.24203491210938, "logps/ref_rejected": -88.06472778320312, "logps/rejected": -157.01319885253906, "loss": 1.0295, "rewards/accuracies": 0.75, "rewards/chosen": -0.5051934123039246, "rewards/margins": 0.5741678476333618, "rewards/rejected": -1.0793613195419312, "step": 542 }, { "epoch": 0.8208616780045351, "epsilon_dpo/beta": 0.015595084056258202, "epsilon_dpo/beta_margin_grad_mean": -0.3929816782474518, "epsilon_dpo/beta_margin_grad_std": 0.16900266706943512, "epsilon_dpo/beta_margin_mean": 0.49183303117752075, "epsilon_dpo/beta_margin_std": 0.8085128664970398, "epsilon_dpo/loss_margin_mean": 31.800132751464844, "grad_norm": 7.8205671310424805, "kl/avg_steps": 0.625, "kl/beta": 0.015691597014665604, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.790130070827028e-08, "logits/chosen": -0.1205248311161995, "logits/rejected": -0.30794036388397217, "logps/chosen": -114.30192565917969, "logps/ref_chosen": -73.00680541992188, "logps/ref_rejected": -99.84335327148438, "logps/rejected": -172.9385986328125, "loss": 1.0993, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6449280977249146, "rewards/margins": 0.49183300137519836, "rewards/rejected": -1.1367610692977905, "step": 543 }, { "epoch": 0.8223733938019653, "epsilon_dpo/beta": 0.015512841753661633, "epsilon_dpo/beta_margin_grad_mean": -0.3808680474758148, "epsilon_dpo/beta_margin_grad_std": 0.17236299812793732, "epsilon_dpo/beta_margin_mean": 0.5604837536811829, "epsilon_dpo/beta_margin_std": 0.8075725436210632, "epsilon_dpo/loss_margin_mean": 36.44947052001953, "grad_norm": 11.447324752807617, "kl/avg_steps": 0.53125, "kl/beta": 0.015594134107232094, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.7125818934366454e-08, "logits/chosen": -0.10196401923894882, "logits/rejected": -0.42052143812179565, "logps/chosen": -129.619140625, "logps/ref_chosen": -87.3775863647461, "logps/ref_rejected": -112.98512268066406, "logps/rejected": -191.6761474609375, "loss": 1.0477, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6571992635726929, "rewards/margins": 0.5604837536811829, "rewards/rejected": -1.2176830768585205, "step": 544 }, { "epoch": 0.8238851095993953, "epsilon_dpo/beta": 0.015464799478650093, "epsilon_dpo/beta_margin_grad_mean": -0.41932106018066406, "epsilon_dpo/beta_margin_grad_std": 0.16810117661952972, "epsilon_dpo/beta_margin_mean": 0.37158551812171936, "epsilon_dpo/beta_margin_std": 0.780260443687439, "epsilon_dpo/loss_margin_mean": 24.382139205932617, "grad_norm": 7.527310848236084, "kl/avg_steps": 0.3125, "kl/beta": 0.015511727891862392, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.635601198741607e-08, "logits/chosen": -0.010136006399989128, "logits/rejected": -0.3866921663284302, "logps/chosen": -129.83676147460938, "logps/ref_chosen": -85.00263977050781, "logps/ref_rejected": -105.95549011230469, "logps/rejected": -175.1717529296875, "loss": 1.1873, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6954314708709717, "rewards/margins": 0.371585488319397, "rewards/rejected": -1.0670169591903687, "step": 545 }, { "epoch": 0.8253968253968254, "epsilon_dpo/beta": 0.015387626364827156, "epsilon_dpo/beta_margin_grad_mean": -0.3998827040195465, "epsilon_dpo/beta_margin_grad_std": 0.1473892629146576, "epsilon_dpo/beta_margin_mean": 0.45398467779159546, "epsilon_dpo/beta_margin_std": 0.682119607925415, "epsilon_dpo/loss_margin_mean": 29.757774353027344, "grad_norm": 8.235089302062988, "kl/avg_steps": 0.5, "kl/beta": 0.015463404357433319, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.559190140057428e-08, "logits/chosen": -0.2149541974067688, "logits/rejected": -0.11502528190612793, "logps/chosen": -117.99137878417969, "logps/ref_chosen": -79.49039459228516, "logps/ref_rejected": -87.46519470214844, "logps/rejected": -155.7239532470703, "loss": 1.0882, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5945260524749756, "rewards/margins": 0.45398467779159546, "rewards/rejected": -1.0485107898712158, "step": 546 }, { "epoch": 0.8269085411942555, "epsilon_dpo/beta": 0.015287027694284916, "epsilon_dpo/beta_margin_grad_mean": -0.3833127021789551, "epsilon_dpo/beta_margin_grad_std": 0.15344193577766418, "epsilon_dpo/beta_margin_mean": 0.5260194540023804, "epsilon_dpo/beta_margin_std": 0.7001830339431763, "epsilon_dpo/loss_margin_mean": 34.64781951904297, "grad_norm": 8.841313362121582, "kl/avg_steps": 0.65625, "kl/beta": 0.015386472456157207, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.483350854765672e-08, "logits/chosen": -0.3627888560295105, "logits/rejected": -0.21551699936389923, "logps/chosen": -110.71125793457031, "logps/ref_chosen": -75.10725402832031, "logps/ref_rejected": -97.79402160644531, "logps/rejected": -168.0458526611328, "loss": 1.0396, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5463103652000427, "rewards/margins": 0.5260194540023804, "rewards/rejected": -1.0723297595977783, "step": 547 }, { "epoch": 0.8284202569916855, "epsilon_dpo/beta": 0.015235133469104767, "epsilon_dpo/beta_margin_grad_mean": -0.4219856262207031, "epsilon_dpo/beta_margin_grad_std": 0.16018447279930115, "epsilon_dpo/beta_margin_mean": 0.3609781563282013, "epsilon_dpo/beta_margin_std": 0.7288519144058228, "epsilon_dpo/loss_margin_mean": 24.029176712036133, "grad_norm": 9.881436347961426, "kl/avg_steps": 0.34375, "kl/beta": 0.01528615690767765, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.4080854642541826e-08, "logits/chosen": 0.08564075827598572, "logits/rejected": -0.3461897671222687, "logps/chosen": -133.748779296875, "logps/ref_chosen": -85.9015121459961, "logps/ref_rejected": -108.32438659667969, "logps/rejected": -180.20083618164062, "loss": 1.1796, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7319462299346924, "rewards/margins": 0.3609781265258789, "rewards/rejected": -1.0929243564605713, "step": 548 }, { "epoch": 0.8299319727891157, "epsilon_dpo/beta": 0.015168659389019012, "epsilon_dpo/beta_margin_grad_mean": -0.42518672347068787, "epsilon_dpo/beta_margin_grad_std": 0.17199109494686127, "epsilon_dpo/beta_margin_mean": 0.3430107533931732, "epsilon_dpo/beta_margin_std": 0.7894731163978577, "epsilon_dpo/loss_margin_mean": 22.958250045776367, "grad_norm": 16.42542839050293, "kl/avg_steps": 0.4375, "kl/beta": 0.015233790501952171, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.333396073857723e-08, "logits/chosen": -0.14731433987617493, "logits/rejected": -0.31095337867736816, "logps/chosen": -131.48519897460938, "logps/ref_chosen": -87.12271118164062, "logps/ref_rejected": -121.61825561523438, "logps/rejected": -188.93899536132812, "loss": 1.2156, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6761950254440308, "rewards/margins": 0.3430107533931732, "rewards/rejected": -1.0192058086395264, "step": 549 }, { "epoch": 0.8314436885865457, "epsilon_dpo/beta": 0.015102584846317768, "epsilon_dpo/beta_margin_grad_mean": -0.4151040017604828, "epsilon_dpo/beta_margin_grad_std": 0.1606675386428833, "epsilon_dpo/beta_margin_mean": 0.3868383765220642, "epsilon_dpo/beta_margin_std": 0.7236196994781494, "epsilon_dpo/loss_margin_mean": 25.927637100219727, "grad_norm": 9.797503471374512, "kl/avg_steps": 0.4375, "kl/beta": 0.015167432837188244, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.259284772799099e-08, "logits/chosen": -0.009366992861032486, "logits/rejected": -0.278700590133667, "logps/chosen": -124.78021240234375, "logps/ref_chosen": -80.63383483886719, "logps/ref_rejected": -87.28580474853516, "logps/rejected": -157.35983276367188, "loss": 1.1575, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6684355735778809, "rewards/margins": 0.3868383765220642, "rewards/rejected": -1.0552740097045898, "step": 550 }, { "epoch": 0.8329554043839759, "epsilon_dpo/beta": 0.01503679994493723, "epsilon_dpo/beta_margin_grad_mean": -0.41068235039711, "epsilon_dpo/beta_margin_grad_std": 0.13416306674480438, "epsilon_dpo/beta_margin_mean": 0.39317819476127625, "epsilon_dpo/beta_margin_std": 0.6135088205337524, "epsilon_dpo/loss_margin_mean": 26.400938034057617, "grad_norm": 8.979781150817871, "kl/avg_steps": 0.4375, "kl/beta": 0.015101364813745022, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.1857536341307176e-08, "logits/chosen": 0.00559891015291214, "logits/rejected": -0.38543951511383057, "logps/chosen": -133.9377899169922, "logps/ref_chosen": -92.95742797851562, "logps/ref_rejected": -112.68172454833984, "logps/rejected": -180.06301879882812, "loss": 1.1175, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6161550283432007, "rewards/margins": 0.39317816495895386, "rewards/rejected": -1.0093331336975098, "step": 551 }, { "epoch": 0.8344671201814059, "epsilon_dpo/beta": 0.014957202598452568, "epsilon_dpo/beta_margin_grad_mean": -0.4155219495296478, "epsilon_dpo/beta_margin_grad_std": 0.1576501727104187, "epsilon_dpo/beta_margin_mean": 0.3671215772628784, "epsilon_dpo/beta_margin_std": 0.7070900797843933, "epsilon_dpo/loss_margin_mean": 24.845149993896484, "grad_norm": 10.171217918395996, "kl/avg_steps": 0.53125, "kl/beta": 0.015035583637654781, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.112804714676593e-08, "logits/chosen": -0.25429147481918335, "logits/rejected": -0.29028773307800293, "logps/chosen": -131.58714294433594, "logps/ref_chosen": -88.42652130126953, "logps/ref_rejected": -111.27716064453125, "logps/rejected": -179.28292846679688, "loss": 1.1692, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6473616361618042, "rewards/margins": 0.3671215772628784, "rewards/rejected": -1.0144832134246826, "step": 552 }, { "epoch": 0.8359788359788359, "epsilon_dpo/beta": 0.014906208962202072, "epsilon_dpo/beta_margin_grad_mean": -0.4175128936767578, "epsilon_dpo/beta_margin_grad_std": 0.18361634016036987, "epsilon_dpo/beta_margin_mean": 0.3894207775592804, "epsilon_dpo/beta_margin_std": 0.86497563123703, "epsilon_dpo/loss_margin_mean": 26.525848388671875, "grad_norm": 9.084397315979004, "kl/avg_steps": 0.34375, "kl/beta": 0.014956129714846611, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.0404400549748144e-08, "logits/chosen": 0.05000998079776764, "logits/rejected": -0.2641226649284363, "logps/chosen": -126.92143249511719, "logps/ref_chosen": -78.53276824951172, "logps/ref_rejected": -111.15892028808594, "logps/rejected": -186.07342529296875, "loss": 1.2031, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7230408191680908, "rewards/margins": 0.3894208073616028, "rewards/rejected": -1.1124615669250488, "step": 553 }, { "epoch": 0.8374905517762661, "epsilon_dpo/beta": 0.01482719462364912, "epsilon_dpo/beta_margin_grad_mean": -0.39437729120254517, "epsilon_dpo/beta_margin_grad_std": 0.15337437391281128, "epsilon_dpo/beta_margin_mean": 0.4819382429122925, "epsilon_dpo/beta_margin_std": 0.7117973566055298, "epsilon_dpo/loss_margin_mean": 32.775718688964844, "grad_norm": 8.914608001708984, "kl/avg_steps": 0.53125, "kl/beta": 0.014904893934726715, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.3598790466785431, "logits/rejected": -0.19130371510982513, "logps/chosen": -129.90811157226562, "logps/ref_chosen": -85.90342712402344, "logps/ref_rejected": -94.58822631835938, "logps/rejected": -171.36862182617188, "loss": 1.0757, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6546410918235779, "rewards/margins": 0.4819382429122925, "rewards/rejected": -1.1365792751312256, "step": 554 }, { "epoch": 0.8390022675736961, "epsilon_dpo/beta": 0.014753474853932858, "epsilon_dpo/beta_margin_grad_mean": -0.40565571188926697, "epsilon_dpo/beta_margin_grad_std": 0.1690124273300171, "epsilon_dpo/beta_margin_mean": 0.42326241731643677, "epsilon_dpo/beta_margin_std": 0.7654484510421753, "epsilon_dpo/loss_margin_mean": 29.026073455810547, "grad_norm": 9.91170597076416, "kl/avg_steps": 0.5, "kl/beta": 0.014826130121946335, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.89747159520904e-08, "logits/chosen": -0.008408933877944946, "logits/rejected": -0.022642306983470917, "logps/chosen": -136.08340454101562, "logps/ref_chosen": -87.41081237792969, "logps/ref_rejected": -94.39276123046875, "logps/rejected": -172.09144592285156, "loss": 1.1425, "rewards/accuracies": 0.75, "rewards/chosen": -0.7206442952156067, "rewards/margins": 0.42326241731643677, "rewards/rejected": -1.1439067125320435, "step": 555 }, { "epoch": 0.8405139833711263, "epsilon_dpo/beta": 0.014689295552670956, "epsilon_dpo/beta_margin_grad_mean": -0.419183611869812, "epsilon_dpo/beta_margin_grad_std": 0.1609971672296524, "epsilon_dpo/beta_margin_mean": 0.37554728984832764, "epsilon_dpo/beta_margin_std": 0.7430645823478699, "epsilon_dpo/loss_margin_mean": 25.880294799804688, "grad_norm": 7.737394332885742, "kl/avg_steps": 0.4375, "kl/beta": 0.014752368442714214, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.826871794280192e-08, "logits/chosen": 0.10626913607120514, "logits/rejected": -0.08207418769598007, "logps/chosen": -123.96664428710938, "logps/ref_chosen": -71.60616302490234, "logps/ref_rejected": -90.15902709960938, "logps/rejected": -168.39981079101562, "loss": 1.1713, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7713250517845154, "rewards/margins": 0.37554728984832764, "rewards/rejected": -1.1468722820281982, "step": 556 }, { "epoch": 0.8420256991685563, "epsilon_dpo/beta": 0.014616128988564014, "epsilon_dpo/beta_margin_grad_mean": -0.39464807510375977, "epsilon_dpo/beta_margin_grad_std": 0.14853645861148834, "epsilon_dpo/beta_margin_mean": 0.4820627272129059, "epsilon_dpo/beta_margin_std": 0.6935344338417053, "epsilon_dpo/loss_margin_mean": 33.26234436035156, "grad_norm": 8.162524223327637, "kl/avg_steps": 0.5, "kl/beta": 0.014688108116388321, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.756864251262143e-08, "logits/chosen": -0.1140388548374176, "logits/rejected": -0.6151837706565857, "logps/chosen": -125.14369201660156, "logps/ref_chosen": -78.79825592041016, "logps/ref_rejected": -105.23713684082031, "logps/rejected": -184.84490966796875, "loss": 1.069, "rewards/accuracies": 0.75, "rewards/chosen": -0.6800782680511475, "rewards/margins": 0.4820627272129059, "rewards/rejected": -1.1621410846710205, "step": 557 }, { "epoch": 0.8435374149659864, "epsilon_dpo/beta": 0.014561682008206844, "epsilon_dpo/beta_margin_grad_mean": -0.40123188495635986, "epsilon_dpo/beta_margin_grad_std": 0.1584625244140625, "epsilon_dpo/beta_margin_mean": 0.4439917504787445, "epsilon_dpo/beta_margin_std": 0.7239269614219666, "epsilon_dpo/loss_margin_mean": 30.838037490844727, "grad_norm": 10.327901840209961, "kl/avg_steps": 0.375, "kl/beta": 0.014615032821893692, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.687450924416341e-08, "logits/chosen": -0.21190424263477325, "logits/rejected": -0.20192435383796692, "logps/chosen": -131.4257354736328, "logps/ref_chosen": -86.41659545898438, "logps/ref_rejected": -115.81890869140625, "logps/rejected": -191.6660919189453, "loss": 1.1108, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6573868989944458, "rewards/margins": 0.4439917504787445, "rewards/rejected": -1.1013786792755127, "step": 558 }, { "epoch": 0.8450491307634165, "epsilon_dpo/beta": 0.014511831104755402, "epsilon_dpo/beta_margin_grad_mean": -0.41161999106407166, "epsilon_dpo/beta_margin_grad_std": 0.18268904089927673, "epsilon_dpo/beta_margin_mean": 0.41245096921920776, "epsilon_dpo/beta_margin_std": 0.8520277142524719, "epsilon_dpo/loss_margin_mean": 28.83552360534668, "grad_norm": 8.330303192138672, "kl/avg_steps": 0.34375, "kl/beta": 0.014560431241989136, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.6186337553827743e-08, "logits/chosen": -0.13291501998901367, "logits/rejected": -0.49170762300491333, "logps/chosen": -128.95718383789062, "logps/ref_chosen": -84.51522064208984, "logps/ref_rejected": -105.63461303710938, "logps/rejected": -178.912109375, "loss": 1.1804, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6472938060760498, "rewards/margins": 0.41245096921920776, "rewards/rejected": -1.0597448348999023, "step": 559 }, { "epoch": 0.8465608465608465, "epsilon_dpo/beta": 0.014443977735936642, "epsilon_dpo/beta_margin_grad_mean": -0.38788938522338867, "epsilon_dpo/beta_margin_grad_std": 0.1522243767976761, "epsilon_dpo/beta_margin_mean": 0.5074370503425598, "epsilon_dpo/beta_margin_std": 0.6997931003570557, "epsilon_dpo/loss_margin_mean": 35.44127655029297, "grad_norm": 7.828211784362793, "kl/avg_steps": 0.46875, "kl/beta": 0.01451055146753788, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.550414669125573e-08, "logits/chosen": 0.03868032246828079, "logits/rejected": 0.07886442542076111, "logps/chosen": -128.6056671142578, "logps/ref_chosen": -84.04110717773438, "logps/ref_rejected": -98.00209045410156, "logps/rejected": -178.0079345703125, "loss": 1.053, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6456395983695984, "rewards/margins": 0.5074370503425598, "rewards/rejected": -1.1530766487121582, "step": 560 }, { "epoch": 0.8480725623582767, "epsilon_dpo/beta": 0.014367558993399143, "epsilon_dpo/beta_margin_grad_mean": -0.38891005516052246, "epsilon_dpo/beta_margin_grad_std": 0.1498725563287735, "epsilon_dpo/beta_margin_mean": 0.5143095850944519, "epsilon_dpo/beta_margin_std": 0.7102375626564026, "epsilon_dpo/loss_margin_mean": 36.05918884277344, "grad_norm": 7.392366409301758, "kl/avg_steps": 0.53125, "kl/beta": 0.014442849904298782, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.482795573879241e-08, "logits/chosen": -0.23694953322410583, "logits/rejected": -0.20725828409194946, "logps/chosen": -127.86459350585938, "logps/ref_chosen": -85.260009765625, "logps/ref_rejected": -99.17951202392578, "logps/rejected": -177.84329223632812, "loss": 1.0484, "rewards/accuracies": 0.75, "rewards/chosen": -0.61405348777771, "rewards/margins": 0.5143096446990967, "rewards/rejected": -1.128363013267517, "step": 561 }, { "epoch": 0.8495842781557067, "epsilon_dpo/beta": 0.01430959440767765, "epsilon_dpo/beta_margin_grad_mean": -0.402746319770813, "epsilon_dpo/beta_margin_grad_std": 0.14407731592655182, "epsilon_dpo/beta_margin_mean": 0.441642165184021, "epsilon_dpo/beta_margin_std": 0.6657573580741882, "epsilon_dpo/loss_margin_mean": 31.14838218688965, "grad_norm": 8.244582176208496, "kl/avg_steps": 0.40625, "kl/beta": 0.014366528019309044, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.415778361095226e-08, "logits/chosen": -0.13111603260040283, "logits/rejected": -0.1937008500099182, "logps/chosen": -150.1193389892578, "logps/ref_chosen": -101.58757019042969, "logps/ref_rejected": -116.84658813476562, "logps/rejected": -196.52674865722656, "loss": 1.0929, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6962746977806091, "rewards/margins": 0.441642165184021, "rewards/rejected": -1.1379168033599854, "step": 562 }, { "epoch": 0.8510959939531368, "epsilon_dpo/beta": 0.014247225597500801, "epsilon_dpo/beta_margin_grad_mean": -0.39969557523727417, "epsilon_dpo/beta_margin_grad_std": 0.17446556687355042, "epsilon_dpo/beta_margin_mean": 0.466126948595047, "epsilon_dpo/beta_margin_std": 0.8048774003982544, "epsilon_dpo/loss_margin_mean": 33.10220718383789, "grad_norm": 8.794232368469238, "kl/avg_steps": 0.4375, "kl/beta": 0.014308400452136993, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.10935737937688828, "logits/rejected": -0.1448805034160614, "logps/chosen": -109.391845703125, "logps/ref_chosen": -69.69686889648438, "logps/ref_rejected": -90.02664184570312, "logps/rejected": -162.82382202148438, "loss": 1.1205, "rewards/accuracies": 0.75, "rewards/chosen": -0.5677230358123779, "rewards/margins": 0.4661269187927246, "rewards/rejected": -1.0338499546051025, "step": 563 }, { "epoch": 0.8526077097505669, "epsilon_dpo/beta": 0.014180713333189487, "epsilon_dpo/beta_margin_grad_mean": -0.4073896110057831, "epsilon_dpo/beta_margin_grad_std": 0.1713608354330063, "epsilon_dpo/beta_margin_mean": 0.4204823076725006, "epsilon_dpo/beta_margin_std": 0.78926020860672, "epsilon_dpo/loss_margin_mean": 30.017087936401367, "grad_norm": 8.327298164367676, "kl/avg_steps": 0.46875, "kl/beta": 0.014246073551476002, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.283557064487785e-08, "logits/chosen": -0.06540387868881226, "logits/rejected": -0.05503258854150772, "logps/chosen": -117.8368911743164, "logps/ref_chosen": -79.32344055175781, "logps/ref_rejected": -90.12184143066406, "logps/rejected": -158.65237426757812, "loss": 1.1517, "rewards/accuracies": 0.75, "rewards/chosen": -0.5486175417900085, "rewards/margins": 0.4204823076725006, "rewards/rejected": -0.9690998792648315, "step": 564 }, { "epoch": 0.854119425547997, "epsilon_dpo/beta": 0.01411898247897625, "epsilon_dpo/beta_margin_grad_mean": -0.41273975372314453, "epsilon_dpo/beta_margin_grad_std": 0.15865632891654968, "epsilon_dpo/beta_margin_mean": 0.39410722255706787, "epsilon_dpo/beta_margin_std": 0.7258629202842712, "epsilon_dpo/loss_margin_mean": 28.23537254333496, "grad_norm": 6.96705436706543, "kl/avg_steps": 0.4375, "kl/beta": 0.014179606921970844, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.218356679178252e-08, "logits/chosen": -0.21644821763038635, "logits/rejected": -0.06834931671619415, "logps/chosen": -133.81097412109375, "logps/ref_chosen": -85.25615692138672, "logps/ref_rejected": -108.77261352539062, "logps/rejected": -185.5627899169922, "loss": 1.1514, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6872435212135315, "rewards/margins": 0.39410722255706787, "rewards/rejected": -1.0813508033752441, "step": 565 }, { "epoch": 0.8556311413454271, "epsilon_dpo/beta": 0.014070717617869377, "epsilon_dpo/beta_margin_grad_mean": -0.4168214201927185, "epsilon_dpo/beta_margin_grad_std": 0.1676950603723526, "epsilon_dpo/beta_margin_mean": 0.3780567944049835, "epsilon_dpo/beta_margin_std": 0.7630917429924011, "epsilon_dpo/loss_margin_mean": 27.264877319335938, "grad_norm": 12.438030242919922, "kl/avg_steps": 0.34375, "kl/beta": 0.014117840677499771, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.1537655732553764e-08, "logits/chosen": -0.2886316180229187, "logits/rejected": -0.3793463706970215, "logps/chosen": -137.4063262939453, "logps/ref_chosen": -93.50871276855469, "logps/ref_rejected": -94.66215515136719, "logps/rejected": -165.82464599609375, "loss": 1.1777, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6199392080307007, "rewards/margins": 0.3780567944049835, "rewards/rejected": -0.9979960918426514, "step": 566 }, { "epoch": 0.8571428571428571, "epsilon_dpo/beta": 0.013991734944283962, "epsilon_dpo/beta_margin_grad_mean": -0.3932262063026428, "epsilon_dpo/beta_margin_grad_std": 0.14714936912059784, "epsilon_dpo/beta_margin_mean": 0.48000258207321167, "epsilon_dpo/beta_margin_std": 0.6740255355834961, "epsilon_dpo/loss_margin_mean": 34.59055709838867, "grad_norm": 8.408370971679688, "kl/avg_steps": 0.5625, "kl/beta": 0.01406947709619999, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.089785553471233e-08, "logits/chosen": -0.0031261444091796875, "logits/rejected": -0.3361750841140747, "logps/chosen": -118.34280395507812, "logps/ref_chosen": -74.65908813476562, "logps/ref_rejected": -104.87959289550781, "logps/rejected": -183.15386962890625, "loss": 1.0661, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6130213737487793, "rewards/margins": 0.48000258207321167, "rewards/rejected": -1.0930240154266357, "step": 567 }, { "epoch": 0.8586545729402872, "epsilon_dpo/beta": 0.013922218233346939, "epsilon_dpo/beta_margin_grad_mean": -0.3934234082698822, "epsilon_dpo/beta_margin_grad_std": 0.1543276011943817, "epsilon_dpo/beta_margin_mean": 0.4908979535102844, "epsilon_dpo/beta_margin_std": 0.7174976468086243, "epsilon_dpo/loss_margin_mean": 35.56275177001953, "grad_norm": 7.462749004364014, "kl/avg_steps": 0.5, "kl/beta": 0.013990779407322407, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.026418409484513e-08, "logits/chosen": -0.2286403328180313, "logits/rejected": -0.3169150948524475, "logps/chosen": -112.17529296875, "logps/ref_chosen": -75.05364227294922, "logps/ref_rejected": -115.28018188476562, "logps/rejected": -187.96458435058594, "loss": 1.0702, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5188618302345276, "rewards/margins": 0.4908979535102844, "rewards/rejected": -1.009759783744812, "step": 568 }, { "epoch": 0.8601662887377173, "epsilon_dpo/beta": 0.013870356604456902, "epsilon_dpo/beta_margin_grad_mean": -0.4265102446079254, "epsilon_dpo/beta_margin_grad_std": 0.15814505517482758, "epsilon_dpo/beta_margin_mean": 0.3268107771873474, "epsilon_dpo/beta_margin_std": 0.7094203233718872, "epsilon_dpo/loss_margin_mean": 23.92220687866211, "grad_norm": 9.30414867401123, "kl/avg_steps": 0.375, "kl/beta": 0.01392117328941822, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.963665913810451e-08, "logits/chosen": -0.32446902990341187, "logits/rejected": -0.19666746258735657, "logps/chosen": -131.11181640625, "logps/ref_chosen": -87.77325439453125, "logps/ref_rejected": -98.90798950195312, "logps/rejected": -166.16876220703125, "loss": 1.2032, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6034388542175293, "rewards/margins": 0.3268108069896698, "rewards/rejected": -0.9302496910095215, "step": 569 }, { "epoch": 0.8616780045351474, "epsilon_dpo/beta": 0.013783859089016914, "epsilon_dpo/beta_margin_grad_mean": -0.3789195418357849, "epsilon_dpo/beta_margin_grad_std": 0.14751386642456055, "epsilon_dpo/beta_margin_mean": 0.554857611656189, "epsilon_dpo/beta_margin_std": 0.6948018074035645, "epsilon_dpo/loss_margin_mean": 40.50212097167969, "grad_norm": 7.067422866821289, "kl/avg_steps": 0.625, "kl/beta": 0.013869163580238819, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9015298217712453e-08, "logits/chosen": -0.28197193145751953, "logits/rejected": -0.05522429943084717, "logps/chosen": -109.03466796875, "logps/ref_chosen": -74.21205139160156, "logps/ref_rejected": -107.4862060546875, "logps/rejected": -182.81094360351562, "loss": 1.0136, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4814842939376831, "rewards/margins": 0.554857611656189, "rewards/rejected": -1.036341905593872, "step": 570 }, { "epoch": 0.8631897203325775, "epsilon_dpo/beta": 0.013724091462790966, "epsilon_dpo/beta_margin_grad_mean": -0.4350747764110565, "epsilon_dpo/beta_margin_grad_std": 0.1576703041791916, "epsilon_dpo/beta_margin_mean": 0.2830599248409271, "epsilon_dpo/beta_margin_std": 0.6972734332084656, "epsilon_dpo/loss_margin_mean": 20.96841049194336, "grad_norm": 9.307204246520996, "kl/avg_steps": 0.4375, "kl/beta": 0.013783019967377186, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.840011871446962e-08, "logits/chosen": -0.12085636705160141, "logits/rejected": -0.09424938261508942, "logps/chosen": -122.22715759277344, "logps/ref_chosen": -77.99315643310547, "logps/ref_rejected": -92.53704833984375, "logps/rejected": -157.73947143554688, "loss": 1.2378, "rewards/accuracies": 0.75, "rewards/chosen": -0.6095610857009888, "rewards/margins": 0.2830599546432495, "rewards/rejected": -0.8926210403442383, "step": 571 }, { "epoch": 0.8647014361300076, "epsilon_dpo/beta": 0.013642866164445877, "epsilon_dpo/beta_margin_grad_mean": -0.41651660203933716, "epsilon_dpo/beta_margin_grad_std": 0.13157060742378235, "epsilon_dpo/beta_margin_mean": 0.3664814531803131, "epsilon_dpo/beta_margin_std": 0.597045361995697, "epsilon_dpo/loss_margin_mean": 27.090225219726562, "grad_norm": 8.313310623168945, "kl/avg_steps": 0.59375, "kl/beta": 0.013722982257604599, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -0.28194281458854675, "logits/rejected": -0.2647858262062073, "logps/chosen": -144.88211059570312, "logps/ref_chosen": -98.92315673828125, "logps/ref_rejected": -86.06552124023438, "logps/rejected": -159.1147003173828, "loss": 1.1352, "rewards/accuracies": 0.78125, "rewards/chosen": -0.62903892993927, "rewards/margins": 0.3664814233779907, "rewards/rejected": -0.9955204129219055, "step": 572 }, { "epoch": 0.8662131519274376, "epsilon_dpo/beta": 0.013583657331764698, "epsilon_dpo/beta_margin_grad_mean": -0.4110637307167053, "epsilon_dpo/beta_margin_grad_std": 0.17743809521198273, "epsilon_dpo/beta_margin_mean": 0.42062875628471375, "epsilon_dpo/beta_margin_std": 0.8343303799629211, "epsilon_dpo/loss_margin_mean": 31.3470401763916, "grad_norm": 6.728772163391113, "kl/avg_steps": 0.4375, "kl/beta": 0.013641982339322567, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.718837261761528e-08, "logits/chosen": -0.49589985609054565, "logits/rejected": -0.6189365386962891, "logps/chosen": -141.02272033691406, "logps/ref_chosen": -94.72535705566406, "logps/ref_rejected": -111.88986206054688, "logps/rejected": -189.5342559814453, "loss": 1.1658, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6313270330429077, "rewards/margins": 0.42062878608703613, "rewards/rejected": -1.0519558191299438, "step": 573 }, { "epoch": 0.8677248677248677, "epsilon_dpo/beta": 0.013522343710064888, "epsilon_dpo/beta_margin_grad_mean": -0.3827129304409027, "epsilon_dpo/beta_margin_grad_std": 0.1437026411294937, "epsilon_dpo/beta_margin_mean": 0.5322164297103882, "epsilon_dpo/beta_margin_std": 0.659810483455658, "epsilon_dpo/loss_margin_mean": 39.66382598876953, "grad_norm": 7.087203502655029, "kl/avg_steps": 0.453125, "kl/beta": 0.013582559302449226, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.659183991914696e-08, "logits/chosen": -0.19681048393249512, "logits/rejected": -0.04592698812484741, "logps/chosen": -123.19125366210938, "logps/ref_chosen": -80.30760192871094, "logps/ref_rejected": -112.79890441894531, "logps/rejected": -195.3463897705078, "loss": 1.0216, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5813426971435547, "rewards/margins": 0.5322164297103882, "rewards/rejected": -1.1135591268539429, "step": 574 }, { "epoch": 0.8692365835222978, "epsilon_dpo/beta": 0.013476158492267132, "epsilon_dpo/beta_margin_grad_mean": -0.42741483449935913, "epsilon_dpo/beta_margin_grad_std": 0.15975911915302277, "epsilon_dpo/beta_margin_mean": 0.3417285084724426, "epsilon_dpo/beta_margin_std": 0.7441447973251343, "epsilon_dpo/loss_margin_mean": 25.704574584960938, "grad_norm": 6.972466945648193, "kl/avg_steps": 0.34375, "kl/beta": 0.013521290384232998, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.600155642716606e-08, "logits/chosen": -0.1062895804643631, "logits/rejected": -0.19543448090553284, "logps/chosen": -135.07362365722656, "logps/ref_chosen": -87.42359161376953, "logps/ref_rejected": -117.86439514160156, "logps/rejected": -191.218994140625, "loss": 1.199, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6450673341751099, "rewards/margins": 0.3417285084724426, "rewards/rejected": -0.9867959022521973, "step": 575 }, { "epoch": 0.8707482993197279, "epsilon_dpo/beta": 0.013392090797424316, "epsilon_dpo/beta_margin_grad_mean": -0.38101744651794434, "epsilon_dpo/beta_margin_grad_std": 0.1552283763885498, "epsilon_dpo/beta_margin_mean": 0.5403695702552795, "epsilon_dpo/beta_margin_std": 0.7085763216018677, "epsilon_dpo/loss_margin_mean": 40.633262634277344, "grad_norm": 8.821754455566406, "kl/avg_steps": 0.625, "kl/beta": 0.013474970124661922, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.5417538653170754e-08, "logits/chosen": -0.09155446290969849, "logits/rejected": -0.23096878826618195, "logps/chosen": -109.19087219238281, "logps/ref_chosen": -73.79335021972656, "logps/ref_rejected": -111.63740539550781, "logps/rejected": -187.66818237304688, "loss": 1.0313, "rewards/accuracies": 0.78125, "rewards/chosen": -0.475741982460022, "rewards/margins": 0.5403695702552795, "rewards/rejected": -1.0161116123199463, "step": 576 }, { "epoch": 0.872260015117158, "epsilon_dpo/beta": 0.013346577063202858, "epsilon_dpo/beta_margin_grad_mean": -0.4340493381023407, "epsilon_dpo/beta_margin_grad_std": 0.15020710229873657, "epsilon_dpo/beta_margin_mean": 0.29871833324432373, "epsilon_dpo/beta_margin_std": 0.6770634055137634, "epsilon_dpo/loss_margin_mean": 22.716121673583984, "grad_norm": 8.71159553527832, "kl/avg_steps": 0.34375, "kl/beta": 0.013391274958848953, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.4839802933393607e-08, "logits/chosen": -0.28973859548568726, "logits/rejected": -0.2949914336204529, "logps/chosen": -130.9017333984375, "logps/ref_chosen": -86.85696411132812, "logps/ref_rejected": -90.04165649414062, "logps/rejected": -156.8025665283203, "loss": 1.2161, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5898460149765015, "rewards/margins": 0.29871833324432373, "rewards/rejected": -0.8885643482208252, "step": 577 }, { "epoch": 0.873771730914588, "epsilon_dpo/beta": 0.01332170981913805, "epsilon_dpo/beta_margin_grad_mean": -0.4408455193042755, "epsilon_dpo/beta_margin_grad_std": 0.14023886620998383, "epsilon_dpo/beta_margin_mean": 0.26660671830177307, "epsilon_dpo/beta_margin_std": 0.6218577027320862, "epsilon_dpo/loss_margin_mean": 20.344070434570312, "grad_norm": 7.295518398284912, "kl/avg_steps": 0.1875, "kl/beta": 0.013345399871468544, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 2.4268365428344733e-08, "logits/chosen": 0.05835941433906555, "logits/rejected": -0.23089148104190826, "logps/chosen": -125.45712280273438, "logps/ref_chosen": -80.96696472167969, "logps/ref_rejected": -90.08758544921875, "logps/rejected": -154.92181396484375, "loss": 1.228, "rewards/accuracies": 0.625, "rewards/chosen": -0.5944623947143555, "rewards/margins": 0.26660671830177307, "rewards/rejected": -0.8610690832138062, "step": 578 }, { "epoch": 0.8752834467120182, "epsilon_dpo/beta": 0.013238495215773582, "epsilon_dpo/beta_margin_grad_mean": -0.396721214056015, "epsilon_dpo/beta_margin_grad_std": 0.14099986851215363, "epsilon_dpo/beta_margin_mean": 0.4538329839706421, "epsilon_dpo/beta_margin_std": 0.6490737795829773, "epsilon_dpo/loss_margin_mean": 34.52947235107422, "grad_norm": 7.298642635345459, "kl/avg_steps": 0.625, "kl/beta": 0.01332042459398508, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.3703242122359357e-08, "logits/chosen": 0.07702606916427612, "logits/rejected": 0.09959688782691956, "logps/chosen": -129.31687927246094, "logps/ref_chosen": -80.56574249267578, "logps/ref_rejected": -92.1923828125, "logps/rejected": -175.47299194335938, "loss": 1.0793, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6462571620941162, "rewards/margins": 0.4538329839706421, "rewards/rejected": -1.1000901460647583, "step": 579 }, { "epoch": 0.8767951625094482, "epsilon_dpo/beta": 0.013181092217564583, "epsilon_dpo/beta_margin_grad_mean": -0.4260570704936981, "epsilon_dpo/beta_margin_grad_std": 0.15772272646427155, "epsilon_dpo/beta_margin_mean": 0.33158257603645325, "epsilon_dpo/beta_margin_std": 0.7125260233879089, "epsilon_dpo/loss_margin_mean": 25.510705947875977, "grad_norm": 9.164182662963867, "kl/avg_steps": 0.4375, "kl/beta": 0.013237688690423965, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.3144448823151392e-08, "logits/chosen": -0.31698185205459595, "logits/rejected": -0.5376484990119934, "logps/chosen": -122.604248046875, "logps/ref_chosen": -81.20346069335938, "logps/ref_rejected": -105.1043701171875, "logps/rejected": -172.015869140625, "loss": 1.1996, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5469723343849182, "rewards/margins": 0.33158260583877563, "rewards/rejected": -0.8785548806190491, "step": 580 }, { "epoch": 0.8783068783068783, "epsilon_dpo/beta": 0.013111318461596966, "epsilon_dpo/beta_margin_grad_mean": -0.40876150131225586, "epsilon_dpo/beta_margin_grad_std": 0.1576615869998932, "epsilon_dpo/beta_margin_mean": 0.4062279760837555, "epsilon_dpo/beta_margin_std": 0.7067951560020447, "epsilon_dpo/loss_margin_mean": 31.322311401367188, "grad_norm": 8.796152114868164, "kl/avg_steps": 0.53125, "kl/beta": 0.013180025853216648, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.259200116137039e-08, "logits/chosen": -0.1651584953069687, "logits/rejected": -0.24735815823078156, "logps/chosen": -129.39312744140625, "logps/ref_chosen": -80.71034240722656, "logps/ref_rejected": -107.85765838623047, "logps/rejected": -187.86276245117188, "loss": 1.1368, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6408957839012146, "rewards/margins": 0.4062279462814331, "rewards/rejected": -1.0471237897872925, "step": 581 }, { "epoch": 0.8798185941043084, "epsilon_dpo/beta": 0.013060449622571468, "epsilon_dpo/beta_margin_grad_mean": -0.4191682040691376, "epsilon_dpo/beta_margin_grad_std": 0.152615487575531, "epsilon_dpo/beta_margin_mean": 0.3628266453742981, "epsilon_dpo/beta_margin_std": 0.6817345023155212, "epsilon_dpo/loss_margin_mean": 28.12277603149414, "grad_norm": 8.426379203796387, "kl/avg_steps": 0.390625, "kl/beta": 0.01311037689447403, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.204591459016525e-08, "logits/chosen": -0.1529655158519745, "logits/rejected": -0.05211438238620758, "logps/chosen": -135.3836212158203, "logps/ref_chosen": -85.75233459472656, "logps/ref_rejected": -77.65898895263672, "logps/rejected": -155.41305541992188, "loss": 1.1641, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6505056619644165, "rewards/margins": 0.3628266453742981, "rewards/rejected": -1.0133322477340698, "step": 582 }, { "epoch": 0.8813303099017384, "epsilon_dpo/beta": 0.013015775009989738, "epsilon_dpo/beta_margin_grad_mean": -0.43119898438453674, "epsilon_dpo/beta_margin_grad_std": 0.1673259735107422, "epsilon_dpo/beta_margin_mean": 0.31490734219551086, "epsilon_dpo/beta_margin_std": 0.7513457536697388, "epsilon_dpo/loss_margin_mean": 24.607948303222656, "grad_norm": 10.292269706726074, "kl/avg_steps": 0.34375, "kl/beta": 0.013059364631772041, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.1506204384751064e-08, "logits/chosen": -0.07927985489368439, "logits/rejected": -0.35749489068984985, "logps/chosen": -120.08602905273438, "logps/ref_chosen": -71.96379089355469, "logps/ref_rejected": -112.45161437988281, "logps/rejected": -185.18179321289062, "loss": 1.2273, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6287330389022827, "rewards/margins": 0.3149073123931885, "rewards/rejected": -0.9436403512954712, "step": 583 }, { "epoch": 0.8828420256991686, "epsilon_dpo/beta": 0.012958982959389687, "epsilon_dpo/beta_margin_grad_mean": -0.41085541248321533, "epsilon_dpo/beta_margin_grad_std": 0.15736804902553558, "epsilon_dpo/beta_margin_mean": 0.40208297967910767, "epsilon_dpo/beta_margin_std": 0.7118176817893982, "epsilon_dpo/loss_margin_mean": 31.387554168701172, "grad_norm": 7.973084926605225, "kl/avg_steps": 0.4375, "kl/beta": 0.013014626689255238, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.09728856419826e-08, "logits/chosen": 0.2169530838727951, "logits/rejected": -0.1978251039981842, "logps/chosen": -100.49431610107422, "logps/ref_chosen": -65.38093566894531, "logps/ref_rejected": -107.81880187988281, "logps/rejected": -174.31973266601562, "loss": 1.1409, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4563906788825989, "rewards/margins": 0.40208297967910767, "rewards/rejected": -0.8584736585617065, "step": 584 }, { "epoch": 0.8843537414965986, "epsilon_dpo/beta": 0.012898484244942665, "epsilon_dpo/beta_margin_grad_mean": -0.42835086584091187, "epsilon_dpo/beta_margin_grad_std": 0.14225082099437714, "epsilon_dpo/beta_margin_mean": 0.3104247748851776, "epsilon_dpo/beta_margin_std": 0.6384141445159912, "epsilon_dpo/loss_margin_mean": 24.375465393066406, "grad_norm": 7.945513725280762, "kl/avg_steps": 0.46875, "kl/beta": 0.012957935221493244, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.044597327993153e-08, "logits/chosen": -0.2803378701210022, "logits/rejected": -0.17878082394599915, "logps/chosen": -124.75691223144531, "logps/ref_chosen": -80.82450866699219, "logps/ref_rejected": -100.2001953125, "logps/rejected": -168.508056640625, "loss": 1.1949, "rewards/accuracies": 0.75, "rewards/chosen": -0.5681923627853394, "rewards/margins": 0.3104247450828552, "rewards/rejected": -0.8786171674728394, "step": 585 }, { "epoch": 0.8858654572940288, "epsilon_dpo/beta": 0.01282621268182993, "epsilon_dpo/beta_margin_grad_mean": -0.39538994431495667, "epsilon_dpo/beta_margin_grad_std": 0.1316211223602295, "epsilon_dpo/beta_margin_mean": 0.4575497508049011, "epsilon_dpo/beta_margin_std": 0.5963794589042664, "epsilon_dpo/loss_margin_mean": 35.92658233642578, "grad_norm": 8.383708000183105, "kl/avg_steps": 0.5625, "kl/beta": 0.01289747841656208, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.9925482037469187e-08, "logits/chosen": 0.13802039623260498, "logits/rejected": 0.03304041922092438, "logps/chosen": -113.84898376464844, "logps/ref_chosen": -74.60169982910156, "logps/ref_rejected": -90.7721939086914, "logps/rejected": -165.94606018066406, "loss": 1.062, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5045956969261169, "rewards/margins": 0.4575497508049011, "rewards/rejected": -0.9621454477310181, "step": 586 }, { "epoch": 0.8873771730914588, "epsilon_dpo/beta": 0.01277050282806158, "epsilon_dpo/beta_margin_grad_mean": -0.39474213123321533, "epsilon_dpo/beta_margin_grad_std": 0.16267690062522888, "epsilon_dpo/beta_margin_mean": 0.47096407413482666, "epsilon_dpo/beta_margin_std": 0.749113142490387, "epsilon_dpo/loss_margin_mean": 37.267826080322266, "grad_norm": 10.214089393615723, "kl/avg_steps": 0.4375, "kl/beta": 0.012825336307287216, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.9411426473854687e-08, "logits/chosen": -0.44531530141830444, "logits/rejected": -0.30305105447769165, "logps/chosen": -123.94343566894531, "logps/ref_chosen": -84.01087188720703, "logps/ref_rejected": -85.56326293945312, "logps/rejected": -162.76364135742188, "loss": 1.0979, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5123590230941772, "rewards/margins": 0.47096407413482666, "rewards/rejected": -0.9833230972290039, "step": 587 }, { "epoch": 0.8888888888888888, "epsilon_dpo/beta": 0.012702901847660542, "epsilon_dpo/beta_margin_grad_mean": -0.4087964594364166, "epsilon_dpo/beta_margin_grad_std": 0.14179039001464844, "epsilon_dpo/beta_margin_mean": 0.3948504626750946, "epsilon_dpo/beta_margin_std": 0.6233694553375244, "epsilon_dpo/loss_margin_mean": 31.392719268798828, "grad_norm": 9.330618858337402, "kl/avg_steps": 0.53125, "kl/beta": 0.012769469991326332, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.890382096832699e-08, "logits/chosen": -0.4431498050689697, "logits/rejected": -0.33868607878685, "logps/chosen": -128.64813232421875, "logps/ref_chosen": -83.79899597167969, "logps/ref_rejected": -105.11346435546875, "logps/rejected": -181.35531616210938, "loss": 1.1214, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5720565319061279, "rewards/margins": 0.3948504328727722, "rewards/rejected": -0.9669069647789001, "step": 588 }, { "epoch": 0.890400604686319, "epsilon_dpo/beta": 0.012623865157365799, "epsilon_dpo/beta_margin_grad_mean": -0.404887318611145, "epsilon_dpo/beta_margin_grad_std": 0.13321241736412048, "epsilon_dpo/beta_margin_mean": 0.41894814372062683, "epsilon_dpo/beta_margin_std": 0.5990987420082092, "epsilon_dpo/loss_margin_mean": 33.42301940917969, "grad_norm": 9.14968204498291, "kl/avg_steps": 0.625, "kl/beta": 0.012701990082859993, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.840267971970344e-08, "logits/chosen": -0.6417205929756165, "logits/rejected": -0.3048175871372223, "logps/chosen": -121.8003158569336, "logps/ref_chosen": -80.36790466308594, "logps/ref_rejected": -97.85157775878906, "logps/rejected": -172.70700073242188, "loss": 1.0935, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5246583819389343, "rewards/margins": 0.41894811391830444, "rewards/rejected": -0.9436064958572388, "step": 589 }, { "epoch": 0.891912320483749, "epsilon_dpo/beta": 0.012557252310216427, "epsilon_dpo/beta_margin_grad_mean": -0.3977271616458893, "epsilon_dpo/beta_margin_grad_std": 0.13341881334781647, "epsilon_dpo/beta_margin_mean": 0.45191994309425354, "epsilon_dpo/beta_margin_std": 0.6039386987686157, "epsilon_dpo/loss_margin_mean": 36.24995803833008, "grad_norm": 7.618983268737793, "kl/avg_steps": 0.53125, "kl/beta": 0.012623095884919167, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -0.39597877860069275, "logits/rejected": -0.14002303779125214, "logps/chosen": -138.19757080078125, "logps/ref_chosen": -93.33839416503906, "logps/ref_rejected": -107.83362579345703, "logps/rejected": -188.94276428222656, "loss": 1.0684, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5645281076431274, "rewards/margins": 0.4519199728965759, "rewards/rejected": -1.0164480209350586, "step": 590 }, { "epoch": 0.8934240362811792, "epsilon_dpo/beta": 0.012490932829678059, "epsilon_dpo/beta_margin_grad_mean": -0.4069448709487915, "epsilon_dpo/beta_margin_grad_std": 0.14455479383468628, "epsilon_dpo/beta_margin_mean": 0.41144993901252747, "epsilon_dpo/beta_margin_std": 0.6489174962043762, "epsilon_dpo/loss_margin_mean": 33.24910354614258, "grad_norm": 9.191134452819824, "kl/avg_steps": 0.53125, "kl/beta": 0.012556389905512333, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.7419845883949098e-08, "logits/chosen": -0.2569578289985657, "logits/rejected": -0.37240132689476013, "logps/chosen": -116.85206604003906, "logps/ref_chosen": -81.7674560546875, "logps/ref_rejected": -107.47393798828125, "logps/rejected": -175.80764770507812, "loss": 1.1141, "rewards/accuracies": 0.75, "rewards/chosen": -0.4404708743095398, "rewards/margins": 0.41144996881484985, "rewards/rejected": -0.8519208431243896, "step": 591 }, { "epoch": 0.8949357520786092, "epsilon_dpo/beta": 0.012424926273524761, "epsilon_dpo/beta_margin_grad_mean": -0.41350051760673523, "epsilon_dpo/beta_margin_grad_std": 0.1312229186296463, "epsilon_dpo/beta_margin_mean": 0.3863595724105835, "epsilon_dpo/beta_margin_std": 0.600444495677948, "epsilon_dpo/loss_margin_mean": 31.340322494506836, "grad_norm": 7.968548774719238, "kl/avg_steps": 0.53125, "kl/beta": 0.012490036897361279, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6938180788793556e-08, "logits/chosen": -0.11009342223405838, "logits/rejected": -0.2954660654067993, "logps/chosen": -112.07117462158203, "logps/ref_chosen": -72.5201416015625, "logps/ref_rejected": -109.68800354003906, "logps/rejected": -180.57936096191406, "loss": 1.1191, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49197059869766235, "rewards/margins": 0.3863595724105835, "rewards/rejected": -0.8783301711082458, "step": 592 }, { "epoch": 0.8964474678760394, "epsilon_dpo/beta": 0.012367033399641514, "epsilon_dpo/beta_margin_grad_mean": -0.41395166516304016, "epsilon_dpo/beta_margin_grad_std": 0.14591091871261597, "epsilon_dpo/beta_margin_mean": 0.38128843903541565, "epsilon_dpo/beta_margin_std": 0.65277498960495, "epsilon_dpo/loss_margin_mean": 31.176050186157227, "grad_norm": 6.928558826446533, "kl/avg_steps": 0.46875, "kl/beta": 0.01242403406649828, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.6463034933723336e-08, "logits/chosen": 0.08199097216129303, "logits/rejected": -0.12141124159097672, "logps/chosen": -98.86445617675781, "logps/ref_chosen": -62.098533630371094, "logps/ref_rejected": -90.62057495117188, "logps/rejected": -158.56256103515625, "loss": 1.1399, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4569452404975891, "rewards/margins": 0.38128843903541565, "rewards/rejected": -0.8382337093353271, "step": 593 }, { "epoch": 0.8979591836734694, "epsilon_dpo/beta": 0.01231706328690052, "epsilon_dpo/beta_margin_grad_mean": -0.42870694398880005, "epsilon_dpo/beta_margin_grad_std": 0.13863804936408997, "epsilon_dpo/beta_margin_mean": 0.314374178647995, "epsilon_dpo/beta_margin_std": 0.6136248111724854, "epsilon_dpo/loss_margin_mean": 25.851320266723633, "grad_norm": 8.039793014526367, "kl/avg_steps": 0.40625, "kl/beta": 0.012366068549454212, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.5994421609589385e-08, "logits/chosen": -0.09637521952390671, "logits/rejected": -0.3282632827758789, "logps/chosen": -136.2696990966797, "logps/ref_chosen": -89.39488220214844, "logps/ref_rejected": -96.3311767578125, "logps/rejected": -169.05731201171875, "loss": 1.1847, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5789014101028442, "rewards/margins": 0.314374178647995, "rewards/rejected": -0.8932756185531616, "step": 594 }, { "epoch": 0.8994708994708994, "epsilon_dpo/beta": 0.012251830659806728, "epsilon_dpo/beta_margin_grad_mean": -0.3980482816696167, "epsilon_dpo/beta_margin_grad_std": 0.1583113670349121, "epsilon_dpo/beta_margin_mean": 0.4468056857585907, "epsilon_dpo/beta_margin_std": 0.7155683636665344, "epsilon_dpo/loss_margin_mean": 36.836448669433594, "grad_norm": 7.929884910583496, "kl/avg_steps": 0.53125, "kl/beta": 0.01231603417545557, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.553235392451377e-08, "logits/chosen": 0.14382167160511017, "logits/rejected": -0.2625676095485687, "logps/chosen": -119.94728088378906, "logps/ref_chosen": -76.57876586914062, "logps/ref_rejected": -110.7374267578125, "logps/rejected": -190.9423828125, "loss": 1.1072, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5333274602890015, "rewards/margins": 0.4468056857585907, "rewards/rejected": -0.9801331758499146, "step": 595 }, { "epoch": 0.9009826152683296, "epsilon_dpo/beta": 0.01225217618048191, "epsilon_dpo/beta_margin_grad_mean": -0.4713704288005829, "epsilon_dpo/beta_margin_grad_std": 0.13841482996940613, "epsilon_dpo/beta_margin_mean": 0.13456059992313385, "epsilon_dpo/beta_margin_std": 0.6147952079772949, "epsilon_dpo/loss_margin_mean": 11.352704048156738, "grad_norm": 7.926502704620361, "kl/avg_steps": 0.0, "kl/beta": 0.012250951491296291, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 1.507684480352292e-08, "logits/chosen": -0.09347207844257355, "logits/rejected": -0.08081246167421341, "logps/chosen": -143.42486572265625, "logps/ref_chosen": -92.4140625, "logps/ref_rejected": -84.22576904296875, "logps/rejected": -146.5892791748047, "loss": 1.3453, "rewards/accuracies": 0.53125, "rewards/chosen": -0.627160906791687, "rewards/margins": 0.13456059992313385, "rewards/rejected": -0.7617214918136597, "step": 596 }, { "epoch": 0.9024943310657596, "epsilon_dpo/beta": 0.012204297818243504, "epsilon_dpo/beta_margin_grad_mean": -0.41553056240081787, "epsilon_dpo/beta_margin_grad_std": 0.13693609833717346, "epsilon_dpo/beta_margin_mean": 0.37261930108070374, "epsilon_dpo/beta_margin_std": 0.6076087951660156, "epsilon_dpo/loss_margin_mean": 30.870195388793945, "grad_norm": 7.076600551605225, "kl/avg_steps": 0.390625, "kl/beta": 0.012250951491296291, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.4627906988186111e-08, "logits/chosen": -0.1559733748435974, "logits/rejected": 0.09400632977485657, "logps/chosen": -108.35760498046875, "logps/ref_chosen": -71.6175537109375, "logps/ref_rejected": -76.79582214355469, "logps/rejected": -144.40606689453125, "loss": 1.1341, "rewards/accuracies": 0.703125, "rewards/chosen": -0.44950392842292786, "rewards/margins": 0.37261927127838135, "rewards/rejected": -0.8221231698989868, "step": 597 }, { "epoch": 0.9040060468631897, "epsilon_dpo/beta": 0.012158735655248165, "epsilon_dpo/beta_margin_grad_mean": -0.4356701672077179, "epsilon_dpo/beta_margin_grad_std": 0.13405224680900574, "epsilon_dpo/beta_margin_mean": 0.28293609619140625, "epsilon_dpo/beta_margin_std": 0.5979596972465515, "epsilon_dpo/loss_margin_mean": 23.588106155395508, "grad_norm": 7.478119850158691, "kl/avg_steps": 0.375, "kl/beta": 0.012203281745314598, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.4185553036259095e-08, "logits/chosen": -0.34806156158447266, "logits/rejected": -0.42723479866981506, "logps/chosen": -133.13076782226562, "logps/ref_chosen": -79.51602172851562, "logps/ref_rejected": -104.75083923339844, "logps/rejected": -181.95370483398438, "loss": 1.2068, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6541522741317749, "rewards/margins": 0.28293609619140625, "rewards/rejected": -0.9370884299278259, "step": 598 }, { "epoch": 0.9055177626606198, "epsilon_dpo/beta": 0.012111391872167587, "epsilon_dpo/beta_margin_grad_mean": -0.424478679895401, "epsilon_dpo/beta_margin_grad_std": 0.1403839886188507, "epsilon_dpo/beta_margin_mean": 0.3317543566226959, "epsilon_dpo/beta_margin_std": 0.6224579215049744, "epsilon_dpo/loss_margin_mean": 27.74344825744629, "grad_norm": 9.905296325683594, "kl/avg_steps": 0.390625, "kl/beta": 0.012157690711319447, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.3749795321332885e-08, "logits/chosen": -0.06255094707012177, "logits/rejected": -0.120047926902771, "logps/chosen": -126.98910522460938, "logps/ref_chosen": -77.9075927734375, "logps/ref_rejected": -96.17262268066406, "logps/rejected": -172.99758911132812, "loss": 1.1725, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5957086682319641, "rewards/margins": 0.3317543864250183, "rewards/rejected": -0.9274630546569824, "step": 599 }, { "epoch": 0.9070294784580499, "epsilon_dpo/beta": 0.01207374595105648, "epsilon_dpo/beta_margin_grad_mean": -0.4346500635147095, "epsilon_dpo/beta_margin_grad_std": 0.12690189480781555, "epsilon_dpo/beta_margin_mean": 0.28666380047798157, "epsilon_dpo/beta_margin_std": 0.5536522269248962, "epsilon_dpo/loss_margin_mean": 24.055171966552734, "grad_norm": 8.634313583374023, "kl/avg_steps": 0.3125, "kl/beta": 0.012110384181141853, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.3320646032487393e-08, "logits/chosen": -0.2699419856071472, "logits/rejected": -0.45036131143569946, "logps/chosen": -131.400634765625, "logps/ref_chosen": -85.35400390625, "logps/ref_rejected": -104.27677154541016, "logps/rejected": -174.37857055664062, "loss": 1.1925, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5574197173118591, "rewards/margins": 0.28666380047798157, "rewards/rejected": -0.8440835475921631, "step": 600 }, { "epoch": 0.9070294784580499, "eval_epsilon_dpo/beta": 0.012021305970847607, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4186500012874603, "eval_epsilon_dpo/beta_margin_grad_std": 0.13757552206516266, "eval_epsilon_dpo/beta_margin_mean": 0.35871562361717224, "eval_epsilon_dpo/beta_margin_std": 0.6142193675041199, "eval_epsilon_dpo/loss_margin_mean": 30.170278549194336, "eval_kl/n_epsilon_steps": 0.28169015049934387, "eval_kl/p_epsilon_steps": 0.7169894576072693, "eval_logits/chosen": -0.20889337360858917, "eval_logits/rejected": -0.34235018491744995, "eval_logps/chosen": -131.7626953125, "eval_logps/ref_chosen": -87.42715454101562, "eval_logps/ref_rejected": -104.23548889160156, "eval_logps/rejected": -178.7413330078125, "eval_loss": 0.5753073692321777, "eval_rewards/accuracies": 0.7235915660858154, "eval_rewards/chosen": -0.5348218083381653, "eval_rewards/margins": 0.35871562361717224, "eval_rewards/rejected": -0.8935373425483704, "eval_runtime": 47.731, "eval_samples_per_second": 48.25, "eval_steps_per_second": 1.508, "step": 600 }, { "epoch": 0.90854119425548, "epsilon_dpo/beta": 0.012013494968414307, "epsilon_dpo/beta_margin_grad_mean": -0.4087972640991211, "epsilon_dpo/beta_margin_grad_std": 0.14293596148490906, "epsilon_dpo/beta_margin_mean": 0.40643519163131714, "epsilon_dpo/beta_margin_std": 0.6445590853691101, "epsilon_dpo/loss_margin_mean": 34.13679504394531, "grad_norm": 7.5616302490234375, "kl/avg_steps": 0.5, "kl/beta": 0.012072657234966755, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2898117173950868e-08, "logits/chosen": -0.26879027485847473, "logits/rejected": -0.3316187262535095, "logps/chosen": -113.0506591796875, "logps/ref_chosen": -76.60881042480469, "logps/ref_rejected": -104.78749084472656, "logps/rejected": -175.3661346435547, "loss": 1.1165, "rewards/accuracies": 0.734375, "rewards/chosen": -0.43959861993789673, "rewards/margins": 0.40643519163131714, "rewards/rejected": -0.8460338115692139, "step": 601 }, { "epoch": 0.91005291005291, "epsilon_dpo/beta": 0.011964989826083183, "epsilon_dpo/beta_margin_grad_mean": -0.4060874879360199, "epsilon_dpo/beta_margin_grad_std": 0.1276295930147171, "epsilon_dpo/beta_margin_mean": 0.41427186131477356, "epsilon_dpo/beta_margin_std": 0.5662131309509277, "epsilon_dpo/loss_margin_mean": 34.92207717895508, "grad_norm": 12.094139099121094, "kl/avg_steps": 0.40625, "kl/beta": 0.012012594379484653, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.2482220564763667e-08, "logits/chosen": -0.035280682146549225, "logits/rejected": -0.3258655071258545, "logps/chosen": -114.74050903320312, "logps/ref_chosen": -82.1366958618164, "logps/ref_rejected": -95.36003875732422, "logps/rejected": -162.88592529296875, "loss": 1.0888, "rewards/accuracies": 0.75, "rewards/chosen": -0.3918892741203308, "rewards/margins": 0.41427183151245117, "rewards/rejected": -0.8061611652374268, "step": 602 }, { "epoch": 0.9115646258503401, "epsilon_dpo/beta": 0.011905360966920853, "epsilon_dpo/beta_margin_grad_mean": -0.4135136306285858, "epsilon_dpo/beta_margin_grad_std": 0.13839876651763916, "epsilon_dpo/beta_margin_mean": 0.3756512701511383, "epsilon_dpo/beta_margin_std": 0.612551748752594, "epsilon_dpo/loss_margin_mean": 31.871030807495117, "grad_norm": 8.700664520263672, "kl/avg_steps": 0.5, "kl/beta": 0.011963990516960621, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2072967838448051e-08, "logits/chosen": -0.3890395164489746, "logits/rejected": -0.2390347272157669, "logps/chosen": -116.29725646972656, "logps/ref_chosen": -72.34424591064453, "logps/ref_rejected": -94.63508605957031, "logps/rejected": -170.45913696289062, "loss": 1.1334, "rewards/accuracies": 0.75, "rewards/chosen": -0.5251051783561707, "rewards/margins": 0.3756512403488159, "rewards/rejected": -0.9007564783096313, "step": 603 }, { "epoch": 0.9130763416477702, "epsilon_dpo/beta": 0.011849851347506046, "epsilon_dpo/beta_margin_grad_mean": -0.41967111825942993, "epsilon_dpo/beta_margin_grad_std": 0.14534930884838104, "epsilon_dpo/beta_margin_mean": 0.35719671845436096, "epsilon_dpo/beta_margin_std": 0.6550381183624268, "epsilon_dpo/loss_margin_mean": 30.46562957763672, "grad_norm": 6.926849842071533, "kl/avg_steps": 0.46875, "kl/beta": 0.011904468759894371, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1670370442682459e-08, "logits/chosen": -0.06593503057956696, "logits/rejected": -0.2122724950313568, "logps/chosen": -130.89698791503906, "logps/ref_chosen": -96.72001647949219, "logps/ref_rejected": -91.02459716796875, "logps/rejected": -155.66720581054688, "loss": 1.1601, "rewards/accuracies": 0.75, "rewards/chosen": -0.4063175618648529, "rewards/margins": 0.35719674825668335, "rewards/rejected": -0.7635142803192139, "step": 604 }, { "epoch": 0.9145880574452003, "epsilon_dpo/beta": 0.011801970191299915, "epsilon_dpo/beta_margin_grad_mean": -0.4265349209308624, "epsilon_dpo/beta_margin_grad_std": 0.13410498201847076, "epsilon_dpo/beta_margin_mean": 0.3190124034881592, "epsilon_dpo/beta_margin_std": 0.5920915007591248, "epsilon_dpo/loss_margin_mean": 27.37565803527832, "grad_norm": 8.321650505065918, "kl/avg_steps": 0.40625, "kl/beta": 0.011848926544189453, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1274439638981532e-08, "logits/chosen": -0.27900180220603943, "logits/rejected": -0.18813619017601013, "logps/chosen": -131.6309356689453, "logps/ref_chosen": -84.01524353027344, "logps/ref_rejected": -109.6864013671875, "logps/rejected": -184.67776489257812, "loss": 1.1748, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5635058283805847, "rewards/margins": 0.3190124034881592, "rewards/rejected": -0.8825182318687439, "step": 605 }, { "epoch": 0.9160997732426304, "epsilon_dpo/beta": 0.011743154376745224, "epsilon_dpo/beta_margin_grad_mean": -0.4185470938682556, "epsilon_dpo/beta_margin_grad_std": 0.11598649621009827, "epsilon_dpo/beta_margin_mean": 0.35316967964172363, "epsilon_dpo/beta_margin_std": 0.5112137198448181, "epsilon_dpo/loss_margin_mean": 30.316177368164062, "grad_norm": 7.579667091369629, "kl/avg_steps": 0.5, "kl/beta": 0.011800984852015972, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0885186502381016e-08, "logits/chosen": -0.10875275731086731, "logits/rejected": -0.19334736466407776, "logps/chosen": -115.47592163085938, "logps/ref_chosen": -75.84971618652344, "logps/ref_rejected": -102.4177017211914, "logps/rejected": -172.36007690429688, "loss": 1.1251, "rewards/accuracies": 0.78125, "rewards/chosen": -0.46557578444480896, "rewards/margins": 0.35316967964172363, "rewards/rejected": -0.8187454342842102, "step": 606 }, { "epoch": 0.9176114890400605, "epsilon_dpo/beta": 0.011677390895783901, "epsilon_dpo/beta_margin_grad_mean": -0.41183313727378845, "epsilon_dpo/beta_margin_grad_std": 0.12496085464954376, "epsilon_dpo/beta_margin_mean": 0.380126029253006, "epsilon_dpo/beta_margin_std": 0.5489794015884399, "epsilon_dpo/loss_margin_mean": 32.82114791870117, "grad_norm": 8.1083984375, "kl/avg_steps": 0.5625, "kl/beta": 0.011742273345589638, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0502621921127774e-08, "logits/chosen": -0.30899912118911743, "logits/rejected": -0.4363841414451599, "logps/chosen": -133.69430541992188, "logps/ref_chosen": -87.947998046875, "logps/ref_rejected": -99.3750991821289, "logps/rejected": -177.94256591796875, "loss": 1.1127, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5348155498504639, "rewards/margins": 0.380126029253006, "rewards/rejected": -0.9149416089057922, "step": 607 }, { "epoch": 0.9191232048374905, "epsilon_dpo/beta": 0.011623021215200424, "epsilon_dpo/beta_margin_grad_mean": -0.4458061158657074, "epsilon_dpo/beta_margin_grad_std": 0.1285128891468048, "epsilon_dpo/beta_margin_mean": 0.2374372035264969, "epsilon_dpo/beta_margin_std": 0.5633123517036438, "epsilon_dpo/loss_margin_mean": 20.721681594848633, "grad_norm": 8.346819877624512, "kl/avg_steps": 0.46875, "kl/beta": 0.011676592752337456, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -0.0955449789762497, "logits/rejected": -0.5488607883453369, "logps/chosen": -138.93194580078125, "logps/ref_chosen": -88.26110076904297, "logps/ref_rejected": -116.95655822753906, "logps/rejected": -188.34909057617188, "loss": 1.2379, "rewards/accuracies": 0.71875, "rewards/chosen": -0.590623676776886, "rewards/margins": 0.2374371886253357, "rewards/rejected": -0.8280608654022217, "step": 608 }, { "epoch": 0.9206349206349206, "epsilon_dpo/beta": 0.011557895690202713, "epsilon_dpo/beta_margin_grad_mean": -0.4109853506088257, "epsilon_dpo/beta_margin_grad_std": 0.11524208635091782, "epsilon_dpo/beta_margin_mean": 0.3793385326862335, "epsilon_dpo/beta_margin_std": 0.5043821930885315, "epsilon_dpo/loss_margin_mean": 33.06473159790039, "grad_norm": 8.368597030639648, "kl/avg_steps": 0.5625, "kl/beta": 0.011622114107012749, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.757601041885694e-09, "logits/chosen": -0.08982004970312119, "logits/rejected": -0.21673482656478882, "logps/chosen": -110.45091247558594, "logps/ref_chosen": -72.53919982910156, "logps/ref_rejected": -86.69680786132812, "logps/rejected": -157.67324829101562, "loss": 1.1025, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4392332434654236, "rewards/margins": 0.3793385326862335, "rewards/rejected": -0.8185718059539795, "step": 609 }, { "epoch": 0.9221466364323507, "epsilon_dpo/beta": 0.011511306278407574, "epsilon_dpo/beta_margin_grad_mean": -0.42309725284576416, "epsilon_dpo/beta_margin_grad_std": 0.15262646973133087, "epsilon_dpo/beta_margin_mean": 0.34350430965423584, "epsilon_dpo/beta_margin_std": 0.6798610091209412, "epsilon_dpo/loss_margin_mean": 30.249845504760742, "grad_norm": 11.12037181854248, "kl/avg_steps": 0.40625, "kl/beta": 0.011557105928659439, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.395165583732379e-09, "logits/chosen": -0.2149120271205902, "logits/rejected": -0.40282920002937317, "logps/chosen": -153.38609313964844, "logps/ref_chosen": -107.02760314941406, "logps/ref_rejected": -110.5920181274414, "logps/rejected": -187.20034790039062, "loss": 1.1798, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5368313789367676, "rewards/margins": 0.34350430965423584, "rewards/rejected": -0.8803356885910034, "step": 610 }, { "epoch": 0.9236583522297808, "epsilon_dpo/beta": 0.011471925303339958, "epsilon_dpo/beta_margin_grad_mean": -0.4368804097175598, "epsilon_dpo/beta_margin_grad_std": 0.11818581819534302, "epsilon_dpo/beta_margin_mean": 0.27571043372154236, "epsilon_dpo/beta_margin_std": 0.5203703045845032, "epsilon_dpo/loss_margin_mean": 24.31031608581543, "grad_norm": 8.035173416137695, "kl/avg_steps": 0.34375, "kl/beta": 0.01151034515351057, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 9.03946036001449e-09, "logits/chosen": -0.575390636920929, "logits/rejected": -0.44690048694610596, "logps/chosen": -112.02863311767578, "logps/ref_chosen": -70.45919036865234, "logps/ref_rejected": -97.04055786132812, "logps/rejected": -162.92031860351562, "loss": 1.1931, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4776163399219513, "rewards/margins": 0.27571046352386475, "rewards/rejected": -0.7533267736434937, "step": 611 }, { "epoch": 0.9251700680272109, "epsilon_dpo/beta": 0.011414701119065285, "epsilon_dpo/beta_margin_grad_mean": -0.4051789343357086, "epsilon_dpo/beta_margin_grad_std": 0.1485532522201538, "epsilon_dpo/beta_margin_mean": 0.4205136001110077, "epsilon_dpo/beta_margin_std": 0.6696256995201111, "epsilon_dpo/loss_margin_mean": 37.19691848754883, "grad_norm": 6.956040382385254, "kl/avg_steps": 0.5, "kl/beta": 0.011470913887023926, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.690495320571839e-09, "logits/chosen": -0.20279085636138916, "logits/rejected": -0.087235227227211, "logps/chosen": -128.58370971679688, "logps/ref_chosen": -83.53645324707031, "logps/ref_rejected": -112.86213684082031, "logps/rejected": -195.10629272460938, "loss": 1.113, "rewards/accuracies": 0.75, "rewards/chosen": -0.5160987377166748, "rewards/margins": 0.4205136001110077, "rewards/rejected": -0.9366123676300049, "step": 612 }, { "epoch": 0.926681783824641, "epsilon_dpo/beta": 0.011357910931110382, "epsilon_dpo/beta_margin_grad_mean": -0.3938443064689636, "epsilon_dpo/beta_margin_grad_std": 0.14389140903949738, "epsilon_dpo/beta_margin_mean": 0.4673750102519989, "epsilon_dpo/beta_margin_std": 0.6484513878822327, "epsilon_dpo/loss_margin_mean": 41.5140266418457, "grad_norm": 7.930556774139404, "kl/avg_steps": 0.5, "kl/beta": 0.011413844302296638, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.348280226706722e-09, "logits/chosen": -0.13990065455436707, "logits/rejected": 0.2776636481285095, "logps/chosen": -111.08456420898438, "logps/ref_chosen": -77.24560546875, "logps/ref_rejected": -84.56202697753906, "logps/rejected": -159.91500854492188, "loss": 1.0697, "rewards/accuracies": 0.765625, "rewards/chosen": -0.38656705617904663, "rewards/margins": 0.4673749804496765, "rewards/rejected": -0.8539420366287231, "step": 613 }, { "epoch": 0.9281934996220711, "epsilon_dpo/beta": 0.011297854594886303, "epsilon_dpo/beta_margin_grad_mean": -0.41276368498802185, "epsilon_dpo/beta_margin_grad_std": 0.12965954840183258, "epsilon_dpo/beta_margin_mean": 0.3738853335380554, "epsilon_dpo/beta_margin_std": 0.5611046552658081, "epsilon_dpo/loss_margin_mean": 33.407039642333984, "grad_norm": 8.850772857666016, "kl/avg_steps": 0.53125, "kl/beta": 0.011357058770954609, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.012824650910937e-09, "logits/chosen": 0.023061083629727364, "logits/rejected": 0.16475391387939453, "logps/chosen": -129.9523162841797, "logps/ref_chosen": -85.42471313476562, "logps/ref_rejected": -95.38154602050781, "logps/rejected": -173.3162078857422, "loss": 1.1217, "rewards/accuracies": 0.75, "rewards/chosen": -0.5050544738769531, "rewards/margins": 0.3738853335380554, "rewards/rejected": -0.8789397478103638, "step": 614 }, { "epoch": 0.9297052154195011, "epsilon_dpo/beta": 0.011255805380642414, "epsilon_dpo/beta_margin_grad_mean": -0.4168703258037567, "epsilon_dpo/beta_margin_grad_std": 0.1420803964138031, "epsilon_dpo/beta_margin_mean": 0.3664174973964691, "epsilon_dpo/beta_margin_std": 0.6301956176757812, "epsilon_dpo/loss_margin_mean": 32.92771530151367, "grad_norm": 7.011085510253906, "kl/avg_steps": 0.375, "kl/beta": 0.011297043412923813, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 7.684137976598088e-09, "logits/chosen": -0.35232728719711304, "logits/rejected": -0.32909321784973145, "logps/chosen": -147.59274291992188, "logps/ref_chosen": -98.17005157470703, "logps/ref_rejected": -123.09346008300781, "logps/rejected": -205.44387817382812, "loss": 1.1459, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5581832528114319, "rewards/margins": 0.3664174973964691, "rewards/rejected": -0.9246007800102234, "step": 615 }, { "epoch": 0.9312169312169312, "epsilon_dpo/beta": 0.011213753372430801, "epsilon_dpo/beta_margin_grad_mean": -0.42112424969673157, "epsilon_dpo/beta_margin_grad_std": 0.13007456064224243, "epsilon_dpo/beta_margin_mean": 0.34851300716400146, "epsilon_dpo/beta_margin_std": 0.5745180249214172, "epsilon_dpo/loss_margin_mean": 31.411874771118164, "grad_norm": 8.707462310791016, "kl/avg_steps": 0.375, "kl/beta": 0.011254837736487389, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 7.36222939784098e-09, "logits/chosen": -0.14188328385353088, "logits/rejected": -0.38138338923454285, "logps/chosen": -123.18547058105469, "logps/ref_chosen": -78.42754364013672, "logps/ref_rejected": -102.52302551269531, "logps/rejected": -178.6928253173828, "loss": 1.145, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5035969018936157, "rewards/margins": 0.34851300716400146, "rewards/rejected": -0.852109968662262, "step": 616 }, { "epoch": 0.9327286470143613, "epsilon_dpo/beta": 0.011143824085593224, "epsilon_dpo/beta_margin_grad_mean": -0.42190423607826233, "epsilon_dpo/beta_margin_grad_std": 0.10602878034114838, "epsilon_dpo/beta_margin_mean": 0.3308263123035431, "epsilon_dpo/beta_margin_std": 0.46238380670547485, "epsilon_dpo/loss_margin_mean": 29.89263153076172, "grad_norm": 8.340224266052246, "kl/avg_steps": 0.625, "kl/beta": 0.011212789453566074, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.047107919114586e-09, "logits/chosen": 0.02730114758014679, "logits/rejected": 0.01917155086994171, "logps/chosen": -133.4224853515625, "logps/ref_chosen": -82.33126831054688, "logps/ref_rejected": -103.87673950195312, "logps/rejected": -184.86058044433594, "loss": 1.1331, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5706514120101929, "rewards/margins": 0.3308263123035431, "rewards/rejected": -0.9014778137207031, "step": 617 }, { "epoch": 0.9342403628117913, "epsilon_dpo/beta": 0.01110595092177391, "epsilon_dpo/beta_margin_grad_mean": -0.42988893389701843, "epsilon_dpo/beta_margin_grad_std": 0.13014353811740875, "epsilon_dpo/beta_margin_mean": 0.3075876832008362, "epsilon_dpo/beta_margin_std": 0.5768880844116211, "epsilon_dpo/loss_margin_mean": 28.036251068115234, "grad_norm": 9.315959930419922, "kl/avg_steps": 0.34375, "kl/beta": 0.011143145151436329, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 6.738782355044048e-09, "logits/chosen": -0.13639292120933533, "logits/rejected": -0.4161806106567383, "logps/chosen": -121.63025665283203, "logps/ref_chosen": -79.3497085571289, "logps/ref_rejected": -116.9796142578125, "logps/rejected": -187.29641723632812, "loss": 1.1801, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4715430438518524, "rewards/margins": 0.3075876832008362, "rewards/rejected": -0.779130756855011, "step": 618 }, { "epoch": 0.9357520786092215, "epsilon_dpo/beta": 0.011060964316129684, "epsilon_dpo/beta_margin_grad_mean": -0.412675142288208, "epsilon_dpo/beta_margin_grad_std": 0.1362818479537964, "epsilon_dpo/beta_margin_mean": 0.382579505443573, "epsilon_dpo/beta_margin_std": 0.6046925783157349, "epsilon_dpo/loss_margin_mean": 34.932315826416016, "grad_norm": 6.323122501373291, "kl/avg_steps": 0.40625, "kl/beta": 0.01110497210174799, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 6.437261330158206e-09, "logits/chosen": -0.009692233055830002, "logits/rejected": -0.21985238790512085, "logps/chosen": -120.16893005371094, "logps/ref_chosen": -75.64820861816406, "logps/ref_rejected": -106.817138671875, "logps/rejected": -186.27017211914062, "loss": 1.1253, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4942760467529297, "rewards/margins": 0.3825794756412506, "rewards/rejected": -0.8768554925918579, "step": 619 }, { "epoch": 0.9372637944066515, "epsilon_dpo/beta": 0.011012754403054714, "epsilon_dpo/beta_margin_grad_mean": -0.4311310350894928, "epsilon_dpo/beta_margin_grad_std": 0.12295855581760406, "epsilon_dpo/beta_margin_mean": 0.2955702841281891, "epsilon_dpo/beta_margin_std": 0.5336819291114807, "epsilon_dpo/loss_margin_mean": 27.1448917388916, "grad_norm": 7.733489990234375, "kl/avg_steps": 0.4375, "kl/beta": 0.011060040444135666, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.142553278648238e-09, "logits/chosen": -0.19426310062408447, "logits/rejected": -0.00829014927148819, "logps/chosen": -125.66661071777344, "logps/ref_chosen": -82.22972106933594, "logps/ref_rejected": -85.6536865234375, "logps/rejected": -156.2354736328125, "loss": 1.1801, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4805551767349243, "rewards/margins": 0.2955702841281891, "rewards/rejected": -0.776125431060791, "step": 620 }, { "epoch": 0.9387755102040817, "epsilon_dpo/beta": 0.010961341671645641, "epsilon_dpo/beta_margin_grad_mean": -0.4333516061306, "epsilon_dpo/beta_margin_grad_std": 0.11945909261703491, "epsilon_dpo/beta_margin_mean": 0.2852727174758911, "epsilon_dpo/beta_margin_std": 0.5157901048660278, "epsilon_dpo/loss_margin_mean": 26.318817138671875, "grad_norm": 7.324526786804199, "kl/avg_steps": 0.46875, "kl/beta": 0.011011863127350807, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.854666444131934e-09, "logits/chosen": -0.1203552633523941, "logits/rejected": -0.34329789876937866, "logps/chosen": -121.84413146972656, "logps/ref_chosen": -75.2481460571289, "logps/ref_rejected": -113.55223083496094, "logps/rejected": -186.467041015625, "loss": 1.1846, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5119953155517578, "rewards/margins": 0.2852727174758911, "rewards/rejected": -0.7972680330276489, "step": 621 }, { "epoch": 0.9402872260015117, "epsilon_dpo/beta": 0.010906773619353771, "epsilon_dpo/beta_margin_grad_mean": -0.42391106486320496, "epsilon_dpo/beta_margin_grad_std": 0.13388563692569733, "epsilon_dpo/beta_margin_mean": 0.32644522190093994, "epsilon_dpo/beta_margin_std": 0.5833149552345276, "epsilon_dpo/loss_margin_mean": 30.275888442993164, "grad_norm": 10.996826171875, "kl/avg_steps": 0.5, "kl/beta": 0.01096048578619957, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.573608879422875e-09, "logits/chosen": -0.2559853494167328, "logits/rejected": -0.31877291202545166, "logps/chosen": -130.95132446289062, "logps/ref_chosen": -85.68588256835938, "logps/ref_rejected": -102.61013793945312, "logps/rejected": -178.15147399902344, "loss": 1.1669, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49637672305107117, "rewards/margins": 0.32644522190093994, "rewards/rejected": -0.8228219747543335, "step": 622 }, { "epoch": 0.9417989417989417, "epsilon_dpo/beta": 0.010855920612812042, "epsilon_dpo/beta_margin_grad_mean": -0.42936843633651733, "epsilon_dpo/beta_margin_grad_std": 0.11718784272670746, "epsilon_dpo/beta_margin_mean": 0.3037678897380829, "epsilon_dpo/beta_margin_std": 0.5088984370231628, "epsilon_dpo/loss_margin_mean": 28.262224197387695, "grad_norm": 7.055470943450928, "kl/avg_steps": 0.46875, "kl/beta": 0.010905956849455833, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.299388446305342e-09, "logits/chosen": -0.12515604496002197, "logits/rejected": -0.36976397037506104, "logps/chosen": -140.57472229003906, "logps/ref_chosen": -88.4764404296875, "logps/ref_rejected": -115.94358825683594, "logps/rejected": -196.30409240722656, "loss": 1.1668, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5674319863319397, "rewards/margins": 0.3037678897380829, "rewards/rejected": -0.8711998462677002, "step": 623 }, { "epoch": 0.9433106575963719, "epsilon_dpo/beta": 0.010778130032122135, "epsilon_dpo/beta_margin_grad_mean": -0.3959210216999054, "epsilon_dpo/beta_margin_grad_std": 0.11694164574146271, "epsilon_dpo/beta_margin_mean": 0.4492229223251343, "epsilon_dpo/beta_margin_std": 0.5222044587135315, "epsilon_dpo/loss_margin_mean": 41.864505767822266, "grad_norm": 7.092720031738281, "kl/avg_steps": 0.71875, "kl/beta": 0.010855073109269142, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.03201281531429e-09, "logits/chosen": 0.03635944426059723, "logits/rejected": -0.2513130307197571, "logps/chosen": -106.5604248046875, "logps/ref_chosen": -70.26908874511719, "logps/ref_rejected": -98.937255859375, "logps/rejected": -177.09310913085938, "loss": 1.05, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3919287323951721, "rewards/margins": 0.4492229223251343, "rewards/rejected": -0.8411516547203064, "step": 624 }, { "epoch": 0.9448223733938019, "epsilon_dpo/beta": 0.010751740075647831, "epsilon_dpo/beta_margin_grad_mean": -0.4558924436569214, "epsilon_dpo/beta_margin_grad_std": 0.12500344216823578, "epsilon_dpo/beta_margin_mean": 0.19076864421367645, "epsilon_dpo/beta_margin_std": 0.5383822917938232, "epsilon_dpo/loss_margin_mean": 18.10923194885254, "grad_norm": 9.223965644836426, "kl/avg_steps": 0.25, "kl/beta": 0.010777609422802925, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.7714894655209174e-09, "logits/chosen": 0.09123232215642929, "logits/rejected": -0.23934797942638397, "logps/chosen": -128.52659606933594, "logps/ref_chosen": -79.21810913085938, "logps/ref_rejected": -113.8106689453125, "logps/rejected": -181.2283935546875, "loss": 1.274, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5313090085983276, "rewards/margins": 0.19076864421367645, "rewards/rejected": -0.7220776677131653, "step": 625 }, { "epoch": 0.9463340891912321, "epsilon_dpo/beta": 0.0107182078063488, "epsilon_dpo/beta_margin_grad_mean": -0.4198099672794342, "epsilon_dpo/beta_margin_grad_std": 0.14759738743305206, "epsilon_dpo/beta_margin_mean": 0.35964328050613403, "epsilon_dpo/beta_margin_std": 0.6498256921768188, "epsilon_dpo/loss_margin_mean": 34.002769470214844, "grad_norm": 6.630099296569824, "kl/avg_steps": 0.3125, "kl/beta": 0.010750732384622097, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.517825684323323e-09, "logits/chosen": -0.03878095746040344, "logits/rejected": -0.2872091829776764, "logps/chosen": -108.78828430175781, "logps/ref_chosen": -66.3774642944336, "logps/ref_rejected": -118.52725219726562, "logps/rejected": -194.94082641601562, "loss": 1.1576, "rewards/accuracies": 0.65625, "rewards/chosen": -0.45677587389945984, "rewards/margins": 0.35964328050613403, "rewards/rejected": -0.8164191246032715, "step": 626 }, { "epoch": 0.9478458049886621, "epsilon_dpo/beta": 0.01065802201628685, "epsilon_dpo/beta_margin_grad_mean": -0.40233418345451355, "epsilon_dpo/beta_margin_grad_std": 0.12860505282878876, "epsilon_dpo/beta_margin_mean": 0.4263233542442322, "epsilon_dpo/beta_margin_std": 0.5683040022850037, "epsilon_dpo/loss_margin_mean": 40.29022216796875, "grad_norm": 6.485427379608154, "kl/avg_steps": 0.5625, "kl/beta": 0.01071724109351635, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.271028567242818e-09, "logits/chosen": -0.12933257222175598, "logits/rejected": -0.4183410406112671, "logps/chosen": -127.08903503417969, "logps/ref_chosen": -82.48866271972656, "logps/ref_rejected": -122.58235168457031, "logps/rejected": -207.4729461669922, "loss": 1.0801, "rewards/accuracies": 0.796875, "rewards/chosen": -0.47629666328430176, "rewards/margins": 0.4263233542442322, "rewards/rejected": -0.9026199579238892, "step": 627 }, { "epoch": 0.9493575207860923, "epsilon_dpo/beta": 0.010608398355543613, "epsilon_dpo/beta_margin_grad_mean": -0.40769657492637634, "epsilon_dpo/beta_margin_grad_std": 0.15037523210048676, "epsilon_dpo/beta_margin_mean": 0.4015233516693115, "epsilon_dpo/beta_margin_std": 0.6635918021202087, "epsilon_dpo/loss_margin_mean": 38.265560150146484, "grad_norm": 7.2923197746276855, "kl/avg_steps": 0.46875, "kl/beta": 0.0106572937220335, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.0311050177251895e-09, "logits/chosen": -0.4370567202568054, "logits/rejected": 0.030434168875217438, "logps/chosen": -116.41726684570312, "logps/ref_chosen": -75.95014190673828, "logps/ref_rejected": -99.55660247802734, "logps/rejected": -178.28929138183594, "loss": 1.128, "rewards/accuracies": 0.75, "rewards/chosen": -0.4314713478088379, "rewards/margins": 0.4015233516693115, "rewards/rejected": -0.8329946994781494, "step": 628 }, { "epoch": 0.9508692365835223, "epsilon_dpo/beta": 0.010562218725681305, "epsilon_dpo/beta_margin_grad_mean": -0.42028841376304626, "epsilon_dpo/beta_margin_grad_std": 0.1128428652882576, "epsilon_dpo/beta_margin_mean": 0.3450172245502472, "epsilon_dpo/beta_margin_std": 0.4945928454399109, "epsilon_dpo/loss_margin_mean": 32.94646453857422, "grad_norm": 8.863780975341797, "kl/avg_steps": 0.4375, "kl/beta": 0.01060757040977478, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.798061746947995e-09, "logits/chosen": -0.1133829802274704, "logits/rejected": -0.4618754982948303, "logps/chosen": -136.7560272216797, "logps/ref_chosen": -96.68063354492188, "logps/ref_rejected": -99.79010772705078, "logps/rejected": -172.81198120117188, "loss": 1.1281, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4247984290122986, "rewards/margins": 0.3450172543525696, "rewards/rejected": -0.7698156833648682, "step": 629 }, { "epoch": 0.9523809523809523, "epsilon_dpo/beta": 0.01050630770623684, "epsilon_dpo/beta_margin_grad_mean": -0.4169251024723053, "epsilon_dpo/beta_margin_grad_std": 0.11476609110832214, "epsilon_dpo/beta_margin_mean": 0.3586493134498596, "epsilon_dpo/beta_margin_std": 0.5017598271369934, "epsilon_dpo/loss_margin_mean": 34.3900260925293, "grad_norm": 5.832938194274902, "kl/avg_steps": 0.53125, "kl/beta": 0.010561364702880383, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.5719052736323806e-09, "logits/chosen": -0.20752973854541779, "logits/rejected": -0.19195277988910675, "logps/chosen": -120.63751220703125, "logps/ref_chosen": -80.01641845703125, "logps/ref_rejected": -98.34365844726562, "logps/rejected": -173.35476684570312, "loss": 1.1187, "rewards/accuracies": 0.75, "rewards/chosen": -0.4271647334098816, "rewards/margins": 0.3586493134498596, "rewards/rejected": -0.7858140468597412, "step": 630 }, { "epoch": 0.9538926681783825, "epsilon_dpo/beta": 0.010454071685671806, "epsilon_dpo/beta_margin_grad_mean": -0.40833863615989685, "epsilon_dpo/beta_margin_grad_std": 0.13670286536216736, "epsilon_dpo/beta_margin_mean": 0.404376357793808, "epsilon_dpo/beta_margin_std": 0.6010540723800659, "epsilon_dpo/loss_margin_mean": 39.03636169433594, "grad_norm": 7.70844030380249, "kl/avg_steps": 0.5, "kl/beta": 0.010505554266273975, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.352641923861144e-09, "logits/chosen": -0.5213108658790588, "logits/rejected": -0.5298713445663452, "logps/chosen": -124.63873291015625, "logps/ref_chosen": -84.44837951660156, "logps/ref_rejected": -122.06558227539062, "logps/rejected": -201.29229736328125, "loss": 1.1068, "rewards/accuracies": 0.75, "rewards/chosen": -0.4222160577774048, "rewards/margins": 0.40437638759613037, "rewards/rejected": -0.8265924453735352, "step": 631 }, { "epoch": 0.9554043839758125, "epsilon_dpo/beta": 0.01040206104516983, "epsilon_dpo/beta_margin_grad_mean": -0.4239082932472229, "epsilon_dpo/beta_margin_grad_std": 0.12038099765777588, "epsilon_dpo/beta_margin_mean": 0.3268932104110718, "epsilon_dpo/beta_margin_std": 0.5195305347442627, "epsilon_dpo/loss_margin_mean": 31.721187591552734, "grad_norm": 7.052772521972656, "kl/avg_steps": 0.5, "kl/beta": 0.01045328751206398, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.140277830901428e-09, "logits/chosen": -0.21344169974327087, "logits/rejected": -0.29157426953315735, "logps/chosen": -124.53128814697266, "logps/ref_chosen": -81.35279846191406, "logps/ref_rejected": -89.2701416015625, "logps/rejected": -164.16981506347656, "loss": 1.15, "rewards/accuracies": 0.78125, "rewards/chosen": -0.45100072026252747, "rewards/margins": 0.3268932104110718, "rewards/rejected": -0.7778939008712769, "step": 632 }, { "epoch": 0.9569160997732427, "epsilon_dpo/beta": 0.01034705899655819, "epsilon_dpo/beta_margin_grad_mean": -0.40773069858551025, "epsilon_dpo/beta_margin_grad_std": 0.12108438462018967, "epsilon_dpo/beta_margin_mean": 0.4030759036540985, "epsilon_dpo/beta_margin_std": 0.5361250638961792, "epsilon_dpo/loss_margin_mean": 39.24986267089844, "grad_norm": 7.865797996520996, "kl/avg_steps": 0.53125, "kl/beta": 0.010401281528174877, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.9348189350335007e-09, "logits/chosen": -0.14327946305274963, "logits/rejected": -0.15693995356559753, "logps/chosen": -105.8614501953125, "logps/ref_chosen": -73.86644744873047, "logps/ref_rejected": -92.09278106689453, "logps/rejected": -163.337646484375, "loss": 1.0902, "rewards/accuracies": 0.75, "rewards/chosen": -0.3324025869369507, "rewards/margins": 0.4030759036540985, "rewards/rejected": -0.7354785203933716, "step": 633 }, { "epoch": 0.9584278155706727, "epsilon_dpo/beta": 0.010321483016014099, "epsilon_dpo/beta_margin_grad_mean": -0.4732949435710907, "epsilon_dpo/beta_margin_grad_std": 0.12383994460105896, "epsilon_dpo/beta_margin_mean": 0.11720895022153854, "epsilon_dpo/beta_margin_std": 0.532695472240448, "epsilon_dpo/loss_margin_mean": 11.713577270507812, "grad_norm": 7.904653549194336, "kl/avg_steps": 0.25, "kl/beta": 0.01034631673246622, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.736270983384276e-09, "logits/chosen": -0.1411525011062622, "logits/rejected": -0.4012359380722046, "logps/chosen": -134.27059936523438, "logps/ref_chosen": -84.89143371582031, "logps/ref_rejected": -81.0973892211914, "logps/rejected": -142.1901397705078, "loss": 1.3407, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5117350816726685, "rewards/margins": 0.11720894277095795, "rewards/rejected": -0.6289440393447876, "step": 634 }, { "epoch": 0.9599395313681028, "epsilon_dpo/beta": 0.010292517952620983, "epsilon_dpo/beta_margin_grad_mean": -0.44807276129722595, "epsilon_dpo/beta_margin_grad_std": 0.1326771378517151, "epsilon_dpo/beta_margin_mean": 0.22640717029571533, "epsilon_dpo/beta_margin_std": 0.5779455304145813, "epsilon_dpo/loss_margin_mean": 22.384584426879883, "grad_norm": 6.7423930168151855, "kl/avg_steps": 0.28125, "kl/beta": 0.010320515371859074, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.5446395297668287e-09, "logits/chosen": -0.38996532559394836, "logits/rejected": -0.4668923318386078, "logps/chosen": -147.9334716796875, "logps/ref_chosen": -91.09562683105469, "logps/ref_rejected": -113.29243469238281, "logps/rejected": -192.51486206054688, "loss": 1.252, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5871849656105042, "rewards/margins": 0.22640718519687653, "rewards/rejected": -0.8135921955108643, "step": 635 }, { "epoch": 0.9614512471655329, "epsilon_dpo/beta": 0.010228270664811134, "epsilon_dpo/beta_margin_grad_mean": -0.4065358638763428, "epsilon_dpo/beta_margin_grad_std": 0.11634913831949234, "epsilon_dpo/beta_margin_mean": 0.402926504611969, "epsilon_dpo/beta_margin_std": 0.5106287598609924, "epsilon_dpo/loss_margin_mean": 39.64336013793945, "grad_norm": 6.623480319976807, "kl/avg_steps": 0.625, "kl/beta": 0.010291569866240025, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.359929934524829e-09, "logits/chosen": -0.20538179576396942, "logits/rejected": -0.4749613404273987, "logps/chosen": -111.59320068359375, "logps/ref_chosen": -73.47183990478516, "logps/ref_rejected": -103.75514221191406, "logps/rejected": -181.51986694335938, "loss": 1.0845, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3905521333217621, "rewards/margins": 0.402926504611969, "rewards/rejected": -0.7934786081314087, "step": 636 }, { "epoch": 0.9629629629629629, "epsilon_dpo/beta": 0.010177526623010635, "epsilon_dpo/beta_margin_grad_mean": -0.4286331534385681, "epsilon_dpo/beta_margin_grad_std": 0.12078320980072021, "epsilon_dpo/beta_margin_mean": 0.3108992874622345, "epsilon_dpo/beta_margin_std": 0.5280004143714905, "epsilon_dpo/loss_margin_mean": 30.848106384277344, "grad_norm": 5.800489902496338, "kl/avg_steps": 0.5, "kl/beta": 0.010227647610008717, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.1821473643827137e-09, "logits/chosen": -0.2502748966217041, "logits/rejected": -0.46591269969940186, "logps/chosen": -155.5239715576172, "logps/ref_chosen": -100.47197723388672, "logps/ref_rejected": -114.3409423828125, "logps/rejected": -200.24105834960938, "loss": 1.1651, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5617954730987549, "rewards/margins": 0.3108992874622345, "rewards/rejected": -0.8726947903633118, "step": 637 }, { "epoch": 0.9644746787603931, "epsilon_dpo/beta": 0.010130072012543678, "epsilon_dpo/beta_margin_grad_mean": -0.4317198693752289, "epsilon_dpo/beta_margin_grad_std": 0.11745569854974747, "epsilon_dpo/beta_margin_mean": 0.2932567894458771, "epsilon_dpo/beta_margin_std": 0.5088880658149719, "epsilon_dpo/loss_margin_mean": 29.258907318115234, "grad_norm": 7.930274486541748, "kl/avg_steps": 0.46875, "kl/beta": 0.01017676293849945, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.0112967923011646e-09, "logits/chosen": 0.013463929295539856, "logits/rejected": -0.2810739278793335, "logps/chosen": -139.33770751953125, "logps/ref_chosen": -90.44658660888672, "logps/ref_rejected": -111.11790466308594, "logps/rejected": -189.2679443359375, "loss": 1.1759, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4958798885345459, "rewards/margins": 0.2932567894458771, "rewards/rejected": -0.7891367077827454, "step": 638 }, { "epoch": 0.9659863945578231, "epsilon_dpo/beta": 0.01009230688214302, "epsilon_dpo/beta_margin_grad_mean": -0.41252967715263367, "epsilon_dpo/beta_margin_grad_std": 0.12411058694124222, "epsilon_dpo/beta_margin_mean": 0.38042405247688293, "epsilon_dpo/beta_margin_std": 0.540318489074707, "epsilon_dpo/loss_margin_mean": 38.056053161621094, "grad_norm": 6.623051643371582, "kl/avg_steps": 0.375, "kl/beta": 0.010129282251000404, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.847382997337943e-09, "logits/chosen": -0.16237635910511017, "logits/rejected": -0.4033578336238861, "logps/chosen": -113.76445007324219, "logps/ref_chosen": -74.6663818359375, "logps/ref_rejected": -102.23210144042969, "logps/rejected": -179.38623046875, "loss": 1.1104, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3960699737071991, "rewards/margins": 0.3804240822792053, "rewards/rejected": -0.776494026184082, "step": 639 }, { "epoch": 0.9674981103552532, "epsilon_dpo/beta": 0.010041986592113972, "epsilon_dpo/beta_margin_grad_mean": -0.4485768973827362, "epsilon_dpo/beta_margin_grad_std": 0.12330426275730133, "epsilon_dpo/beta_margin_mean": 0.21434807777404785, "epsilon_dpo/beta_margin_std": 0.5284790396690369, "epsilon_dpo/loss_margin_mean": 21.6812801361084, "grad_norm": 6.14130163192749, "kl/avg_steps": 0.5, "kl/beta": 0.010091439820826054, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.690410564514244e-09, "logits/chosen": -0.05122403800487518, "logits/rejected": -0.4697457551956177, "logps/chosen": -117.28038024902344, "logps/ref_chosen": -73.5439453125, "logps/ref_rejected": -96.87593078613281, "logps/rejected": -162.29364013671875, "loss": 1.2506, "rewards/accuracies": 0.75, "rewards/chosen": -0.440321147441864, "rewards/margins": 0.21434809267520905, "rewards/rejected": -0.6546692252159119, "step": 640 }, { "epoch": 0.9690098261526833, "epsilon_dpo/beta": 0.009992025792598724, "epsilon_dpo/beta_margin_grad_mean": -0.4373061954975128, "epsilon_dpo/beta_margin_grad_std": 0.11379314959049225, "epsilon_dpo/beta_margin_mean": 0.26784640550613403, "epsilon_dpo/beta_margin_std": 0.4950565993785858, "epsilon_dpo/loss_margin_mean": 27.0919189453125, "grad_norm": 7.052608966827393, "kl/avg_steps": 0.5, "kl/beta": 0.010041233152151108, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.5403838846864692e-09, "logits/chosen": -0.345502108335495, "logits/rejected": -0.09203990548849106, "logps/chosen": -152.26617431640625, "logps/ref_chosen": -100.62752532958984, "logps/ref_rejected": -110.3574447631836, "logps/rejected": -189.0880126953125, "loss": 1.1944, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5173914432525635, "rewards/margins": 0.26784640550613403, "rewards/rejected": -0.7852379083633423, "step": 641 }, { "epoch": 0.9705215419501134, "epsilon_dpo/beta": 0.00994543731212616, "epsilon_dpo/beta_margin_grad_mean": -0.4270346462726593, "epsilon_dpo/beta_margin_grad_std": 0.11673079431056976, "epsilon_dpo/beta_margin_mean": 0.30874237418174744, "epsilon_dpo/beta_margin_std": 0.5001073479652405, "epsilon_dpo/loss_margin_mean": 31.388866424560547, "grad_norm": 7.703180313110352, "kl/avg_steps": 0.46875, "kl/beta": 0.009991277009248734, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.3973071544233218e-09, "logits/chosen": -0.22989991307258606, "logits/rejected": -0.21155939996242523, "logps/chosen": -143.5499267578125, "logps/ref_chosen": -95.99467468261719, "logps/ref_rejected": -96.79623413085938, "logps/rejected": -175.74032592773438, "loss": 1.1611, "rewards/accuracies": 0.75, "rewards/chosen": -0.47523415088653564, "rewards/margins": 0.3087424039840698, "rewards/rejected": -0.7839765548706055, "step": 642 }, { "epoch": 0.9720332577475435, "epsilon_dpo/beta": 0.009895927272737026, "epsilon_dpo/beta_margin_grad_mean": -0.4284159243106842, "epsilon_dpo/beta_margin_grad_std": 0.12825217843055725, "epsilon_dpo/beta_margin_mean": 0.3099651634693146, "epsilon_dpo/beta_margin_std": 0.5610890984535217, "epsilon_dpo/loss_margin_mean": 31.65646743774414, "grad_norm": 6.967833995819092, "kl/avg_steps": 0.5, "kl/beta": 0.009944661520421505, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.261184375888541e-09, "logits/chosen": -0.30852293968200684, "logits/rejected": -0.6907504796981812, "logps/chosen": -134.0585174560547, "logps/ref_chosen": -89.49581909179688, "logps/ref_rejected": -114.9364013671875, "logps/rejected": -191.15557861328125, "loss": 1.1745, "rewards/accuracies": 0.75, "rewards/chosen": -0.44213464856147766, "rewards/margins": 0.3099651336669922, "rewards/rejected": -0.7520997524261475, "step": 643 }, { "epoch": 0.9735449735449735, "epsilon_dpo/beta": 0.009863686747848988, "epsilon_dpo/beta_margin_grad_mean": -0.4447920322418213, "epsilon_dpo/beta_margin_grad_std": 0.13148462772369385, "epsilon_dpo/beta_margin_mean": 0.2381734848022461, "epsilon_dpo/beta_margin_std": 0.5650452375411987, "epsilon_dpo/loss_margin_mean": 24.548532485961914, "grad_norm": 6.671364784240723, "kl/avg_steps": 0.328125, "kl/beta": 0.009895185008645058, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 0.001531161367893219, "logits/rejected": -0.12236610054969788, "logps/chosen": -117.81045532226562, "logps/ref_chosen": -69.89152526855469, "logps/ref_rejected": -86.02226257324219, "logps/rejected": -158.48971557617188, "loss": 1.2387, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47450461983680725, "rewards/margins": 0.2381734848022461, "rewards/rejected": -0.712678074836731, "step": 644 }, { "epoch": 0.9750566893424036, "epsilon_dpo/beta": 0.009811407886445522, "epsilon_dpo/beta_margin_grad_mean": -0.417654424905777, "epsilon_dpo/beta_margin_grad_std": 0.11769741028547287, "epsilon_dpo/beta_margin_mean": 0.35267549753189087, "epsilon_dpo/beta_margin_std": 0.5047261714935303, "epsilon_dpo/loss_margin_mean": 36.249839782714844, "grad_norm": 6.893727779388428, "kl/avg_steps": 0.53125, "kl/beta": 0.009862823411822319, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.0098157099674987e-09, "logits/chosen": -0.04132319614291191, "logits/rejected": 0.27628880739212036, "logps/chosen": -126.49514770507812, "logps/ref_chosen": -83.9632339477539, "logps/ref_rejected": -95.219970703125, "logps/rejected": -174.001708984375, "loss": 1.1251, "rewards/accuracies": 0.78125, "rewards/chosen": -0.41895389556884766, "rewards/margins": 0.35267549753189087, "rewards/rejected": -0.7716293931007385, "step": 645 }, { "epoch": 0.9765684051398337, "epsilon_dpo/beta": 0.009748812764883041, "epsilon_dpo/beta_margin_grad_mean": -0.4187504053115845, "epsilon_dpo/beta_margin_grad_std": 0.11697126179933548, "epsilon_dpo/beta_margin_mean": 0.34697264432907104, "epsilon_dpo/beta_margin_std": 0.5104357600212097, "epsilon_dpo/loss_margin_mean": 35.826202392578125, "grad_norm": 6.112748146057129, "kl/avg_steps": 0.640625, "kl/beta": 0.009810703806579113, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.945768539031783e-10, "logits/chosen": -0.2783118784427643, "logits/rejected": -0.2582191526889801, "logps/chosen": -134.0037841796875, "logps/ref_chosen": -83.46397399902344, "logps/ref_rejected": -108.6168212890625, "logps/rejected": -194.9828338623047, "loss": 1.1307, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4940229058265686, "rewards/margins": 0.34697264432907104, "rewards/rejected": -0.8409955501556396, "step": 646 }, { "epoch": 0.9780801209372638, "epsilon_dpo/beta": 0.009676109068095684, "epsilon_dpo/beta_margin_grad_mean": -0.3990939259529114, "epsilon_dpo/beta_margin_grad_std": 0.10093998163938522, "epsilon_dpo/beta_margin_mean": 0.42904818058013916, "epsilon_dpo/beta_margin_std": 0.44319963455200195, "epsilon_dpo/loss_margin_mean": 44.51857376098633, "grad_norm": 9.283509254455566, "kl/avg_steps": 0.75, "kl/beta": 0.009748253971338272, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 7.863060120144316e-10, "logits/chosen": -0.45879900455474854, "logits/rejected": -0.3617364466190338, "logps/chosen": -138.1648406982422, "logps/ref_chosen": -88.60772705078125, "logps/ref_rejected": -128.72372436523438, "logps/rejected": -222.79940795898438, "loss": 1.0486, "rewards/accuracies": 0.875, "rewards/chosen": -0.481179416179657, "rewards/margins": 0.42904818058013916, "rewards/rejected": -0.9102275967597961, "step": 647 }, { "epoch": 0.9795918367346939, "epsilon_dpo/beta": 0.009631294757127762, "epsilon_dpo/beta_margin_grad_mean": -0.4343121647834778, "epsilon_dpo/beta_margin_grad_std": 0.11647993326187134, "epsilon_dpo/beta_margin_mean": 0.27863410115242004, "epsilon_dpo/beta_margin_std": 0.49984973669052124, "epsilon_dpo/loss_margin_mean": 29.250940322875977, "grad_norm": 7.5205888748168945, "kl/avg_steps": 0.46875, "kl/beta": 0.009675686247646809, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.850062128694045e-10, "logits/chosen": -0.032209426164627075, "logits/rejected": -0.2213985025882721, "logps/chosen": -143.01754760742188, "logps/ref_chosen": -91.4552001953125, "logps/ref_rejected": -111.99140167236328, "logps/rejected": -192.80470275878906, "loss": 1.1867, "rewards/accuracies": 0.75, "rewards/chosen": -0.49837782979011536, "rewards/margins": 0.27863410115242004, "rewards/rejected": -0.7770119309425354, "step": 648 }, { "epoch": 0.981103552532124, "epsilon_dpo/beta": 0.009580339305102825, "epsilon_dpo/beta_margin_grad_mean": -0.42864057421684265, "epsilon_dpo/beta_margin_grad_std": 0.12436851859092712, "epsilon_dpo/beta_margin_mean": 0.30313241481781006, "epsilon_dpo/beta_margin_std": 0.5281475186347961, "epsilon_dpo/loss_margin_mean": 31.997285842895508, "grad_norm": 7.824967861175537, "kl/avg_steps": 0.53125, "kl/beta": 0.009630543179810047, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.906802900412788e-10, "logits/chosen": 0.16560648381710052, "logits/rejected": -0.0949397012591362, "logps/chosen": -118.04146575927734, "logps/ref_chosen": -72.111328125, "logps/ref_rejected": -96.00141906738281, "logps/rejected": -173.9288330078125, "loss": 1.1731, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44223999977111816, "rewards/margins": 0.30313241481781006, "rewards/rejected": -0.7453724145889282, "step": 649 }, { "epoch": 0.982615268329554, "epsilon_dpo/beta": 0.009541687555611134, "epsilon_dpo/beta_margin_grad_mean": -0.42905229330062866, "epsilon_dpo/beta_margin_grad_std": 0.12155012786388397, "epsilon_dpo/beta_margin_mean": 0.30361443758010864, "epsilon_dpo/beta_margin_std": 0.5176792740821838, "epsilon_dpo/loss_margin_mean": 32.194278717041016, "grad_norm": 7.0833353996276855, "kl/avg_steps": 0.40625, "kl/beta": 0.009579651057720184, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.033308820289184e-10, "logits/chosen": 0.3099375069141388, "logits/rejected": -0.09590494632720947, "logps/chosen": -118.29649353027344, "logps/ref_chosen": -77.13442993164062, "logps/ref_rejected": -101.20818328857422, "logps/rejected": -174.5645294189453, "loss": 1.1698, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3945963978767395, "rewards/margins": 0.30361443758010864, "rewards/rejected": -0.6982108354568481, "step": 650 }, { "epoch": 0.9841269841269841, "epsilon_dpo/beta": 0.009497118182480335, "epsilon_dpo/beta_margin_grad_mean": -0.4275834858417511, "epsilon_dpo/beta_margin_grad_std": 0.10842897742986679, "epsilon_dpo/beta_margin_mean": 0.3093123435974121, "epsilon_dpo/beta_margin_std": 0.4653792381286621, "epsilon_dpo/loss_margin_mean": 32.86103439331055, "grad_norm": 10.239365577697754, "kl/avg_steps": 0.46875, "kl/beta": 0.009540891274809837, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.2296043218295606e-10, "logits/chosen": -0.19762015342712402, "logits/rejected": -0.52814120054245, "logps/chosen": -121.38260650634766, "logps/ref_chosen": -79.18367004394531, "logps/ref_rejected": -103.46632385253906, "logps/rejected": -178.52630615234375, "loss": 1.1523, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40324467420578003, "rewards/margins": 0.3093123435974121, "rewards/rejected": -0.7125570178031921, "step": 651 }, { "epoch": 0.9856386999244142, "epsilon_dpo/beta": 0.009472084231674671, "epsilon_dpo/beta_margin_grad_mean": -0.44292983412742615, "epsilon_dpo/beta_margin_grad_std": 0.12006277590990067, "epsilon_dpo/beta_margin_mean": 0.24692460894584656, "epsilon_dpo/beta_margin_std": 0.5136936902999878, "epsilon_dpo/loss_margin_mean": 26.458864212036133, "grad_norm": 8.063827514648438, "kl/avg_steps": 0.265625, "kl/beta": 0.009496376849710941, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.4957118863768176e-10, "logits/chosen": -0.5991553068161011, "logits/rejected": -0.3117263913154602, "logps/chosen": -135.54934692382812, "logps/ref_chosen": -85.69632720947266, "logps/ref_rejected": -109.90357971191406, "logps/rejected": -186.2154541015625, "loss": 1.2177, "rewards/accuracies": 0.640625, "rewards/chosen": -0.4741317927837372, "rewards/margins": 0.24692459404468536, "rewards/rejected": -0.7210564017295837, "step": 652 }, { "epoch": 0.9871504157218443, "epsilon_dpo/beta": 0.009436645545065403, "epsilon_dpo/beta_margin_grad_mean": -0.42172205448150635, "epsilon_dpo/beta_margin_grad_std": 0.1203337237238884, "epsilon_dpo/beta_margin_mean": 0.33542388677597046, "epsilon_dpo/beta_margin_std": 0.5146583914756775, "epsilon_dpo/loss_margin_mean": 35.929100036621094, "grad_norm": 7.69780158996582, "kl/avg_steps": 0.375, "kl/beta": 0.009471219033002853, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.831652042480093e-10, "logits/chosen": -0.05788855254650116, "logits/rejected": -0.04992194101214409, "logps/chosen": -126.68917846679688, "logps/ref_chosen": -85.57878112792969, "logps/ref_rejected": -99.29658508300781, "logps/rejected": -176.33609008789062, "loss": 1.142, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3894695043563843, "rewards/margins": 0.33542391657829285, "rewards/rejected": -0.7248933911323547, "step": 653 }, { "epoch": 0.9886621315192744, "epsilon_dpo/beta": 0.009401390329003334, "epsilon_dpo/beta_margin_grad_mean": -0.44078996777534485, "epsilon_dpo/beta_margin_grad_std": 0.12072242051362991, "epsilon_dpo/beta_margin_mean": 0.2574543356895447, "epsilon_dpo/beta_margin_std": 0.5202062129974365, "epsilon_dpo/loss_margin_mean": 27.730667114257812, "grad_norm": 8.079339981079102, "kl/avg_steps": 0.375, "kl/beta": 0.009435834363102913, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.2374433653205016e-10, "logits/chosen": -0.04486121982336044, "logits/rejected": -0.5683233737945557, "logps/chosen": -131.8297119140625, "logps/ref_chosen": -83.4058837890625, "logps/ref_rejected": -116.10549926757812, "logps/rejected": -192.260009765625, "loss": 1.2098, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4570308327674866, "rewards/margins": 0.2574543356895447, "rewards/rejected": -0.7144851684570312, "step": 654 }, { "epoch": 0.9901738473167044, "epsilon_dpo/beta": 0.009354515001177788, "epsilon_dpo/beta_margin_grad_mean": -0.4245755076408386, "epsilon_dpo/beta_margin_grad_std": 0.10435869544744492, "epsilon_dpo/beta_margin_mean": 0.3234991431236267, "epsilon_dpo/beta_margin_std": 0.45337221026420593, "epsilon_dpo/loss_margin_mean": 34.833831787109375, "grad_norm": 5.97894811630249, "kl/avg_steps": 0.5, "kl/beta": 0.009400582872331142, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7131024761923852e-10, "logits/chosen": -0.18475328385829926, "logits/rejected": -0.643438458442688, "logps/chosen": -108.57415771484375, "logps/ref_chosen": -72.27444458007812, "logps/ref_rejected": -107.04484558105469, "logps/rejected": -178.17840576171875, "loss": 1.1373, "rewards/accuracies": 0.75, "rewards/chosen": -0.33998844027519226, "rewards/margins": 0.3234991431236267, "rewards/rejected": -0.6634875535964966, "step": 655 }, { "epoch": 0.9916855631141346, "epsilon_dpo/beta": 0.009299205616116524, "epsilon_dpo/beta_margin_grad_mean": -0.4235363304615021, "epsilon_dpo/beta_margin_grad_std": 0.10802899301052094, "epsilon_dpo/beta_margin_mean": 0.32424455881118774, "epsilon_dpo/beta_margin_std": 0.4628637135028839, "epsilon_dpo/loss_margin_mean": 35.1312255859375, "grad_norm": 6.119517803192139, "kl/avg_steps": 0.59375, "kl/beta": 0.009353813715279102, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2586440420372934e-10, "logits/chosen": -0.306098610162735, "logits/rejected": -0.34763431549072266, "logps/chosen": -147.2364501953125, "logps/ref_chosen": -96.3607177734375, "logps/ref_rejected": -111.83285522460938, "logps/rejected": -197.8397979736328, "loss": 1.1393, "rewards/accuracies": 0.78125, "rewards/chosen": -0.47428518533706665, "rewards/margins": 0.32424455881118774, "rewards/rejected": -0.7985297441482544, "step": 656 }, { "epoch": 0.9931972789115646, "epsilon_dpo/beta": 0.009247222915291786, "epsilon_dpo/beta_margin_grad_mean": -0.40973320603370667, "epsilon_dpo/beta_margin_grad_std": 0.11478579789400101, "epsilon_dpo/beta_margin_mean": 0.3866519033908844, "epsilon_dpo/beta_margin_std": 0.4957241117954254, "epsilon_dpo/loss_margin_mean": 42.10652160644531, "grad_norm": 8.461531639099121, "kl/avg_steps": 0.5625, "kl/beta": 0.009298603050410748, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.740807750345913e-11, "logits/chosen": -0.10227912664413452, "logits/rejected": -0.3732077479362488, "logps/chosen": -112.02146911621094, "logps/ref_chosen": -71.39437866210938, "logps/ref_rejected": -100.89120483398438, "logps/rejected": -183.62481689453125, "loss": 1.0948, "rewards/accuracies": 0.765625, "rewards/chosen": -0.37659162282943726, "rewards/margins": 0.3866519033908844, "rewards/rejected": -0.763243556022644, "step": 657 }, { "epoch": 0.9947089947089947, "epsilon_dpo/beta": 0.009207057766616344, "epsilon_dpo/beta_margin_grad_mean": -0.4259795546531677, "epsilon_dpo/beta_margin_grad_std": 0.12907136976718903, "epsilon_dpo/beta_margin_mean": 0.31728169322013855, "epsilon_dpo/beta_margin_std": 0.5570127964019775, "epsilon_dpo/loss_margin_mean": 34.85756301879883, "grad_norm": 7.888942241668701, "kl/avg_steps": 0.4375, "kl/beta": 0.009246590547263622, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.594234322453539e-11, "logits/chosen": -0.19734932482242584, "logits/rejected": -0.16385933756828308, "logps/chosen": -130.331787109375, "logps/ref_chosen": -87.25668334960938, "logps/ref_rejected": -103.51670837402344, "logps/rejected": -181.44937133789062, "loss": 1.1679, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3980225622653961, "rewards/margins": 0.31728169322013855, "rewards/rejected": -0.7153042554855347, "step": 658 }, { "epoch": 0.9962207105064248, "epsilon_dpo/beta": 0.009175584651529789, "epsilon_dpo/beta_margin_grad_mean": -0.4530555009841919, "epsilon_dpo/beta_margin_grad_std": 0.11484676599502563, "epsilon_dpo/beta_margin_mean": 0.19890394806861877, "epsilon_dpo/beta_margin_std": 0.4893724024295807, "epsilon_dpo/loss_margin_mean": 22.028242111206055, "grad_norm": 5.81763219833374, "kl/avg_steps": 0.34375, "kl/beta": 0.009206313639879227, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.146808153123293e-11, "logits/chosen": -0.03687793016433716, "logits/rejected": -0.44385725259780884, "logps/chosen": -132.07925415039062, "logps/ref_chosen": -80.2817153930664, "logps/ref_rejected": -101.70068359375, "logps/rejected": -175.52645874023438, "loss": 1.255, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4763162136077881, "rewards/margins": 0.19890394806861877, "rewards/rejected": -0.6752201318740845, "step": 659 }, { "epoch": 0.9977324263038548, "epsilon_dpo/beta": 0.009118344634771347, "epsilon_dpo/beta_margin_grad_mean": -0.4164464771747589, "epsilon_dpo/beta_margin_grad_std": 0.09780213981866837, "epsilon_dpo/beta_margin_mean": 0.35128533840179443, "epsilon_dpo/beta_margin_std": 0.4149182140827179, "epsilon_dpo/loss_margin_mean": 38.761817932128906, "grad_norm": 5.7514753341674805, "kl/avg_steps": 0.625, "kl/beta": 0.009174775332212448, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.3985977021235829e-11, "logits/chosen": -0.07328711450099945, "logits/rejected": -0.3468150198459625, "logps/chosen": -122.07150268554688, "logps/ref_chosen": -76.87039184570312, "logps/ref_rejected": -109.3966064453125, "logps/rejected": -193.35952758789062, "loss": 1.1069, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4130452573299408, "rewards/margins": 0.35128533840179443, "rewards/rejected": -0.7643306255340576, "step": 660 }, { "epoch": 0.999244142101285, "epsilon_dpo/beta": 0.009085915982723236, "epsilon_dpo/beta_margin_grad_mean": -0.4587934911251068, "epsilon_dpo/beta_margin_grad_std": 0.11900065094232559, "epsilon_dpo/beta_margin_mean": 0.17370952665805817, "epsilon_dpo/beta_margin_std": 0.5091543793678284, "epsilon_dpo/loss_margin_mean": 19.486536026000977, "grad_norm": 5.589955806732178, "kl/avg_steps": 0.359375, "kl/beta": 0.009117788635194302, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.4965187065971735e-12, "logits/chosen": 0.051325783133506775, "logits/rejected": -0.18619848787784576, "logps/chosen": -144.16314697265625, "logps/ref_chosen": -83.48721313476562, "logps/ref_rejected": -106.11189270019531, "logps/rejected": -186.27435302734375, "loss": 1.2825, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5530954599380493, "rewards/margins": 0.17370952665805817, "rewards/rejected": -0.7268049716949463, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.1906072249390895, "train_runtime": 2288.3837, "train_samples_per_second": 18.5, "train_steps_per_second": 0.289 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }