{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 100, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "grad_norm": 17.933454513549805, "kl/avg_steps": 0.09375, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 0.0, "logits/chosen": -0.8771844506263733, "logits/rejected": -0.7888585329055786, "logps/chosen": -80.20932006835938, "logps/ref_chosen": -80.27740478515625, "logps/ref_rejected": -83.5943374633789, "logps/rejected": -83.52326965332031, "loss": 1.3873, "rewards/accuracies": 0.53125, "rewards/chosen": 0.006630806718021631, "rewards/margins": -0.0005992341320961714, "rewards/rejected": 0.007230041082948446, "step": 1 }, { "epoch": 0.0030234315948601664, "grad_norm": 21.353334426879883, "kl/avg_steps": -0.15625, "kl/beta": 0.09990634024143219, "kl/n_epsilon_steps": 0.578125, "kl/p_epsilon_steps": 0.421875, "learning_rate": 7.462686567164179e-09, "logits/chosen": -0.6832054853439331, "logits/rejected": -0.5088719129562378, "logps/chosen": -74.510986328125, "logps/ref_chosen": -74.56095886230469, "logps/ref_rejected": -83.53636169433594, "logps/rejected": -83.51570892333984, "loss": 1.3843, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0048660230822861195, "rewards/margins": 0.0025865831412374973, "rewards/rejected": 0.002279440173879266, "step": 2 }, { "epoch": 0.0045351473922902496, "grad_norm": 19.950443267822266, "kl/avg_steps": 0.03125, "kl/beta": 0.10006268322467804, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -0.6054874658584595, "logits/rejected": -0.3736334443092346, "logps/chosen": -82.1410140991211, "logps/ref_chosen": -82.15100860595703, "logps/ref_rejected": -109.82986450195312, "logps/rejected": -109.80192565917969, "loss": 1.3887, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0008957167156040668, "rewards/margins": -0.0020755126606673002, "rewards/rejected": 0.002971230074763298, "step": 3 }, { "epoch": 0.006046863189720333, "grad_norm": 19.876798629760742, "kl/avg_steps": 0.125, "kl/beta": 0.10003142803907394, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.2388059701492534e-08, "logits/chosen": -0.4454895853996277, "logits/rejected": -0.3323523998260498, "logps/chosen": -92.34318542480469, "logps/ref_chosen": -92.37549591064453, "logps/ref_rejected": -99.59554290771484, "logps/rejected": -99.51423645019531, "loss": 1.3919, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0031029037199914455, "rewards/margins": -0.005186422728002071, "rewards/rejected": 0.00828932598233223, "step": 4 }, { "epoch": 0.007558578987150416, "grad_norm": 18.935115814208984, "kl/avg_steps": 0.09375, "kl/beta": 0.0999065414071083, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -0.6434583067893982, "logits/rejected": -0.43680721521377563, "logps/chosen": -78.93097686767578, "logps/ref_chosen": -78.84872436523438, "logps/ref_rejected": -97.88040161132812, "logps/rejected": -97.91473388671875, "loss": 1.3919, "rewards/accuracies": 0.515625, "rewards/chosen": -0.00838147010654211, "rewards/margins": -0.005112465005367994, "rewards/rejected": -0.00326900533400476, "step": 5 }, { "epoch": 0.009070294784580499, "grad_norm": 18.06861686706543, "kl/avg_steps": 0.0, "kl/beta": 0.09981296956539154, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 3.731343283582089e-08, "logits/chosen": -0.7716882228851318, "logits/rejected": -0.5386408567428589, "logps/chosen": -68.30958557128906, "logps/ref_chosen": -68.34607696533203, "logps/ref_rejected": -99.24613952636719, "logps/rejected": -99.24362182617188, "loss": 1.3836, "rewards/accuracies": 0.515625, "rewards/chosen": 0.00351733504794538, "rewards/margins": 0.0031151659786701202, "rewards/rejected": 0.0004021693021059036, "step": 6 }, { "epoch": 0.010582010582010581, "grad_norm": 17.43248748779297, "kl/avg_steps": -0.1875, "kl/beta": 0.09981296956539154, "kl/n_epsilon_steps": 0.59375, "kl/p_epsilon_steps": 0.40625, "learning_rate": 4.477611940298507e-08, "logits/chosen": -1.039565086364746, "logits/rejected": -0.6296759843826294, "logps/chosen": -69.15159606933594, "logps/ref_chosen": -69.11282348632812, "logps/ref_rejected": -84.01641845703125, "logps/rejected": -83.97854614257812, "loss": 1.3945, "rewards/accuracies": 0.421875, "rewards/chosen": -0.0039845979772508144, "rewards/margins": -0.00791984610259533, "rewards/rejected": 0.00393524719402194, "step": 7 }, { "epoch": 0.012093726379440665, "grad_norm": 18.484458923339844, "kl/avg_steps": -0.15625, "kl/beta": 0.10000047087669373, "kl/n_epsilon_steps": 0.578125, "kl/p_epsilon_steps": 0.421875, "learning_rate": 5.223880597014925e-08, "logits/chosen": -0.7085280418395996, "logits/rejected": -0.4177365303039551, "logps/chosen": -78.41571044921875, "logps/ref_chosen": -78.3912353515625, "logps/ref_rejected": -91.06254577636719, "logps/rejected": -91.00235748291016, "loss": 1.3954, "rewards/accuracies": 0.421875, "rewards/chosen": -0.0025808759965002537, "rewards/margins": -0.008768673986196518, "rewards/rejected": 0.006187797989696264, "step": 8 }, { "epoch": 0.013605442176870748, "grad_norm": 19.37607192993164, "kl/avg_steps": 0.1875, "kl/beta": 0.10015696287155151, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 5.970149253731343e-08, "logits/chosen": -0.5926854610443115, "logits/rejected": -0.6044590473175049, "logps/chosen": -69.67474365234375, "logps/ref_chosen": -69.67422485351562, "logps/ref_rejected": -105.00473022460938, "logps/rejected": -105.07916259765625, "loss": 1.3798, "rewards/accuracies": 0.578125, "rewards/chosen": -0.00018217615433968604, "rewards/margins": 0.0070345159620046616, "rewards/rejected": -0.007216691970825195, "step": 9 }, { "epoch": 0.015117157974300832, "grad_norm": 18.984508514404297, "kl/avg_steps": -0.09375, "kl/beta": 0.0999695211648941, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 6.71641791044776e-08, "logits/chosen": -0.67566978931427, "logits/rejected": -0.4178224802017212, "logps/chosen": -79.67657470703125, "logps/ref_chosen": -79.730712890625, "logps/ref_rejected": -105.50645446777344, "logps/rejected": -105.46436309814453, "loss": 1.3858, "rewards/accuracies": 0.46875, "rewards/chosen": 0.005256780423223972, "rewards/margins": 0.0008970340131781995, "rewards/rejected": 0.004359746817499399, "step": 10 }, { "epoch": 0.016628873771730914, "grad_norm": 17.404315948486328, "kl/avg_steps": -0.0625, "kl/beta": 0.10006333142518997, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 7.462686567164178e-08, "logits/chosen": -0.682112455368042, "logits/rejected": -0.7254103422164917, "logps/chosen": -85.43687438964844, "logps/ref_chosen": -85.41248321533203, "logps/ref_rejected": -86.50241088867188, "logps/rejected": -86.51531219482422, "loss": 1.3882, "rewards/accuracies": 0.453125, "rewards/chosen": -0.0025822517927736044, "rewards/margins": -0.0014805782120674849, "rewards/rejected": -0.0011016735807061195, "step": 11 }, { "epoch": 0.018140589569160998, "grad_norm": 17.35363006591797, "kl/avg_steps": 0.09375, "kl/beta": 0.10012590885162354, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 8.208955223880596e-08, "logits/chosen": -0.48884809017181396, "logits/rejected": -0.3806966543197632, "logps/chosen": -81.39530944824219, "logps/ref_chosen": -81.38086700439453, "logps/ref_rejected": -89.88151550292969, "logps/rejected": -89.9115219116211, "loss": 1.3855, "rewards/accuracies": 0.546875, "rewards/chosen": -0.0015910749789327383, "rewards/margins": 0.0012488359352573752, "rewards/rejected": -0.0028399100992828608, "step": 12 }, { "epoch": 0.019652305366591082, "grad_norm": 17.843292236328125, "kl/avg_steps": 0.0625, "kl/beta": 0.10003212839365005, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 8.955223880597014e-08, "logits/chosen": -1.0486931800842285, "logits/rejected": -0.7209100723266602, "logps/chosen": -63.15821075439453, "logps/ref_chosen": -63.17030715942383, "logps/ref_rejected": -105.61166381835938, "logps/rejected": -105.63218688964844, "loss": 1.3835, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0010905354283750057, "rewards/margins": 0.0030125719495117664, "rewards/rejected": -0.0019220358226448298, "step": 13 }, { "epoch": 0.021164021164021163, "grad_norm": 20.182865142822266, "kl/avg_steps": 0.15625, "kl/beta": 0.09996964782476425, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 9.701492537313432e-08, "logits/chosen": -0.6667978763580322, "logits/rejected": -0.4419565498828888, "logps/chosen": -80.64845275878906, "logps/ref_chosen": -80.71014404296875, "logps/ref_rejected": -89.86041259765625, "logps/rejected": -89.85292053222656, "loss": 1.3814, "rewards/accuracies": 0.609375, "rewards/chosen": 0.006051511503756046, "rewards/margins": 0.005182279273867607, "rewards/rejected": 0.0008692322298884392, "step": 14 }, { "epoch": 0.022675736961451247, "grad_norm": 20.247482299804688, "kl/avg_steps": -0.125, "kl/beta": 0.09981369227170944, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 1.044776119402985e-07, "logits/chosen": -0.8077883720397949, "logits/rejected": -0.4688650667667389, "logps/chosen": -82.10345458984375, "logps/ref_chosen": -82.00294494628906, "logps/ref_rejected": -106.43550109863281, "logps/rejected": -106.45130157470703, "loss": 1.3954, "rewards/accuracies": 0.421875, "rewards/chosen": -0.010214600712060928, "rewards/margins": -0.008754991926252842, "rewards/rejected": -0.00145960901863873, "step": 15 }, { "epoch": 0.02418745275888133, "grad_norm": 17.199460983276367, "kl/avg_steps": 0.03125, "kl/beta": 0.09993861615657806, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.1194029850746268e-07, "logits/chosen": -0.6257915496826172, "logits/rejected": -0.41689813137054443, "logps/chosen": -62.30339813232422, "logps/ref_chosen": -62.308345794677734, "logps/ref_rejected": -89.6508560180664, "logps/rejected": -89.64524841308594, "loss": 1.387, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0003566534724086523, "rewards/margins": -0.00036794866900891066, "rewards/rejected": 0.0007246022578328848, "step": 16 }, { "epoch": 0.025699168556311415, "grad_norm": 18.40418243408203, "kl/avg_steps": 0.125, "kl/beta": 0.09990739077329636, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 1.1940298507462686e-07, "logits/chosen": -0.6596513390541077, "logits/rejected": -0.38339459896087646, "logps/chosen": -85.23394775390625, "logps/ref_chosen": -85.16903686523438, "logps/ref_rejected": -102.57087707519531, "logps/rejected": -102.61199951171875, "loss": 1.3894, "rewards/accuracies": 0.578125, "rewards/chosen": -0.006622787099331617, "rewards/margins": -0.0026723374612629414, "rewards/rejected": -0.003950449638068676, "step": 17 }, { "epoch": 0.027210884353741496, "grad_norm": 17.053964614868164, "kl/avg_steps": 0.25, "kl/beta": 0.09978266060352325, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.2686567164179106e-07, "logits/chosen": -0.8401739597320557, "logits/rejected": -0.48542100191116333, "logps/chosen": -63.1472282409668, "logps/ref_chosen": -63.17793273925781, "logps/ref_rejected": -86.06461334228516, "logps/rejected": -86.12118530273438, "loss": 1.3781, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002997747389599681, "rewards/margins": 0.008445605635643005, "rewards/rejected": -0.005447858478873968, "step": 18 }, { "epoch": 0.02872260015117158, "grad_norm": 19.71549415588379, "kl/avg_steps": -0.078125, "kl/beta": 0.09953382611274719, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.453125, "learning_rate": 1.343283582089552e-07, "logits/chosen": -0.5802021026611328, "logits/rejected": -0.36945077776908875, "logps/chosen": -85.82483673095703, "logps/ref_chosen": -85.82405090332031, "logps/ref_rejected": -100.07136535644531, "logps/rejected": -100.070556640625, "loss": 1.3872, "rewards/accuracies": 0.484375, "rewards/chosen": -0.00019685056759044528, "rewards/margins": -0.00046660611405968666, "rewards/rejected": 0.00026975583750754595, "step": 19 }, { "epoch": 0.030234315948601664, "grad_norm": 18.155420303344727, "kl/avg_steps": 0.03125, "kl/beta": 0.09961164742708206, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.4179104477611938e-07, "logits/chosen": -0.5410428643226624, "logits/rejected": -0.44256073236465454, "logps/chosen": -73.6261978149414, "logps/ref_chosen": -73.58621215820312, "logps/ref_rejected": -91.21690368652344, "logps/rejected": -91.28337860107422, "loss": 1.3843, "rewards/accuracies": 0.53125, "rewards/chosen": -0.004116400144994259, "rewards/margins": 0.002343452535569668, "rewards/rejected": -0.006459852214902639, "step": 20 }, { "epoch": 0.031746031746031744, "grad_norm": 18.056482315063477, "kl/avg_steps": 0.15625, "kl/beta": 0.09958053380250931, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.4925373134328355e-07, "logits/chosen": -0.615408182144165, "logits/rejected": -0.520226776599884, "logps/chosen": -81.95823669433594, "logps/ref_chosen": -81.97251892089844, "logps/ref_rejected": -98.05976867675781, "logps/rejected": -98.11122131347656, "loss": 1.3804, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0012293007457628846, "rewards/margins": 0.006264813244342804, "rewards/rejected": -0.005035512149333954, "step": 21 }, { "epoch": 0.03325774754346183, "grad_norm": 18.43136978149414, "kl/avg_steps": -0.03125, "kl/beta": 0.09942518174648285, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 1.5671641791044775e-07, "logits/chosen": -0.7960255742073059, "logits/rejected": -0.3484349548816681, "logps/chosen": -76.95167541503906, "logps/ref_chosen": -76.99579620361328, "logps/ref_rejected": -95.76089477539062, "logps/rejected": -95.7391357421875, "loss": 1.3847, "rewards/accuracies": 0.484375, "rewards/chosen": 0.004259100183844566, "rewards/margins": 0.0019459626637399197, "rewards/rejected": 0.0023131368216127157, "step": 22 }, { "epoch": 0.03476946334089191, "grad_norm": 18.915191650390625, "kl/avg_steps": 0.28125, "kl/beta": 0.09945625811815262, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.6417910447761193e-07, "logits/chosen": -0.5395127534866333, "logits/rejected": -0.37187278270721436, "logps/chosen": -84.71544647216797, "logps/ref_chosen": -84.76856994628906, "logps/ref_rejected": -107.28266906738281, "logps/rejected": -107.30066680908203, "loss": 1.3798, "rewards/accuracies": 0.578125, "rewards/chosen": 0.005197981372475624, "rewards/margins": 0.006793309934437275, "rewards/rejected": -0.0015953283291310072, "step": 23 }, { "epoch": 0.036281179138321996, "grad_norm": 17.060243606567383, "kl/avg_steps": 0.03125, "kl/beta": 0.09917732328176498, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.716417910447761e-07, "logits/chosen": -0.8160616159439087, "logits/rejected": -0.6523994207382202, "logps/chosen": -69.83349609375, "logps/ref_chosen": -69.87112426757812, "logps/ref_rejected": -84.02084350585938, "logps/rejected": -83.9853744506836, "loss": 1.3866, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0035986052826046944, "rewards/margins": -3.9631209801882505e-05, "rewards/rejected": 0.003638236550614238, "step": 24 }, { "epoch": 0.03779289493575208, "grad_norm": 19.301118850708008, "kl/avg_steps": 0.265625, "kl/beta": 0.09914634376764297, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.7910447761194027e-07, "logits/chosen": -0.5545772314071655, "logits/rejected": -0.5116233825683594, "logps/chosen": -78.25363159179688, "logps/ref_chosen": -78.22694396972656, "logps/ref_rejected": -106.65234375, "logps/rejected": -106.760986328125, "loss": 1.379, "rewards/accuracies": 0.625, "rewards/chosen": -0.002777719870209694, "rewards/margins": 0.007790995761752129, "rewards/rejected": -0.010568715631961823, "step": 25 }, { "epoch": 0.039304610733182165, "grad_norm": 17.692121505737305, "kl/avg_steps": 0.0625, "kl/beta": 0.09888368099927902, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 1.8656716417910447e-07, "logits/chosen": -0.431125283241272, "logits/rejected": -0.19942858815193176, "logps/chosen": -74.57691192626953, "logps/ref_chosen": -74.59750366210938, "logps/ref_rejected": -93.57858276367188, "logps/rejected": -93.59805297851562, "loss": 1.3829, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0019292905926704407, "rewards/margins": 0.003692114260047674, "rewards/rejected": -0.0017628234345465899, "step": 26 }, { "epoch": 0.04081632653061224, "grad_norm": 18.422821044921875, "kl/avg_steps": 0.125, "kl/beta": 0.09882191568613052, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 1.9402985074626865e-07, "logits/chosen": -0.6598864793777466, "logits/rejected": -0.3999100923538208, "logps/chosen": -78.63863372802734, "logps/ref_chosen": -78.64625549316406, "logps/ref_rejected": -92.33645629882812, "logps/rejected": -92.38688659667969, "loss": 1.3812, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0006331197218969464, "rewards/margins": 0.0054315500892698765, "rewards/rejected": -0.004798430018126965, "step": 27 }, { "epoch": 0.042328042328042326, "grad_norm": 17.46875762939453, "kl/avg_steps": -0.03125, "kl/beta": 0.09869854152202606, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 2.0149253731343282e-07, "logits/chosen": -0.8402580618858337, "logits/rejected": -0.7967926263809204, "logps/chosen": -76.87187957763672, "logps/ref_chosen": -76.91271209716797, "logps/ref_rejected": -88.48194885253906, "logps/rejected": -88.45233154296875, "loss": 1.3859, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0038373656570911407, "rewards/margins": 0.00078454555477947, "rewards/rejected": 0.003052819985896349, "step": 28 }, { "epoch": 0.04383975812547241, "grad_norm": 20.94962501525879, "kl/avg_steps": 0.1875, "kl/beta": 0.09872939437627792, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 2.08955223880597e-07, "logits/chosen": -0.38888347148895264, "logits/rejected": -0.36869269609451294, "logps/chosen": -89.60147094726562, "logps/ref_chosen": -89.62060546875, "logps/ref_rejected": -100.57090759277344, "logps/rejected": -100.54659271240234, "loss": 1.3879, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0016972769517451525, "rewards/margins": -0.0009317040676251054, "rewards/rejected": 0.0026289813686162233, "step": 29 }, { "epoch": 0.045351473922902494, "grad_norm": 18.70415687561035, "kl/avg_steps": 0.125, "kl/beta": 0.09854462742805481, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.1641791044776117e-07, "logits/chosen": -0.8333492279052734, "logits/rejected": -0.5384379625320435, "logps/chosen": -68.81825256347656, "logps/ref_chosen": -68.82381439208984, "logps/ref_rejected": -104.7047119140625, "logps/rejected": -104.7557373046875, "loss": 1.3817, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0004366333014331758, "rewards/margins": 0.005281157325953245, "rewards/rejected": -0.0048445239663124084, "step": 30 }, { "epoch": 0.04686318972033258, "grad_norm": 20.447372436523438, "kl/avg_steps": -0.046875, "kl/beta": 0.0984215959906578, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.46875, "learning_rate": 2.2388059701492537e-07, "logits/chosen": -0.7040875554084778, "logits/rejected": -0.4996650815010071, "logps/chosen": -86.09111022949219, "logps/ref_chosen": -86.06916809082031, "logps/ref_rejected": -116.66395568847656, "logps/rejected": -116.6534423828125, "loss": 1.3903, "rewards/accuracies": 0.4375, "rewards/chosen": -0.002270359545946121, "rewards/margins": -0.0035220435820519924, "rewards/rejected": 0.0012516845017671585, "step": 31 }, { "epoch": 0.04837490551776266, "grad_norm": 18.30170440673828, "kl/avg_steps": 0.125, "kl/beta": 0.09846775233745575, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.3134328358208954e-07, "logits/chosen": -1.0229980945587158, "logits/rejected": -0.5279667377471924, "logps/chosen": -87.55634307861328, "logps/ref_chosen": -87.59809112548828, "logps/ref_rejected": -100.26905822753906, "logps/rejected": -100.24147033691406, "loss": 1.3857, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003950035199522972, "rewards/margins": 0.0010602055117487907, "rewards/rejected": 0.0028898296877741814, "step": 32 }, { "epoch": 0.049886621315192746, "grad_norm": 19.37946319580078, "kl/avg_steps": -0.0625, "kl/beta": 0.0983448252081871, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 2.388059701492537e-07, "logits/chosen": -0.8228363394737244, "logits/rejected": -0.6981616616249084, "logps/chosen": -83.27375793457031, "logps/ref_chosen": -83.29850769042969, "logps/ref_rejected": -94.60990142822266, "logps/rejected": -94.55514526367188, "loss": 1.3899, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0023084937129169703, "rewards/margins": -0.0032619782723486423, "rewards/rejected": 0.005570471752434969, "step": 33 }, { "epoch": 0.05139833711262283, "grad_norm": 17.70010757446289, "kl/avg_steps": 0.25, "kl/beta": 0.09840632975101471, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.4626865671641786e-07, "logits/chosen": -0.6430321335792542, "logits/rejected": -0.46988445520401, "logps/chosen": -70.10933685302734, "logps/ref_chosen": -70.15070343017578, "logps/ref_rejected": -84.4693832397461, "logps/rejected": -84.48771667480469, "loss": 1.381, "rewards/accuracies": 0.625, "rewards/chosen": 0.003950835205614567, "rewards/margins": 0.0056062545627355576, "rewards/rejected": -0.001655419822782278, "step": 34 }, { "epoch": 0.05291005291005291, "grad_norm": 17.64505386352539, "kl/avg_steps": 0.15625, "kl/beta": 0.09816092252731323, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 2.537313432835821e-07, "logits/chosen": -0.7675759792327881, "logits/rejected": -0.5877223014831543, "logps/chosen": -78.1800537109375, "logps/ref_chosen": -78.25238037109375, "logps/ref_rejected": -91.06356811523438, "logps/rejected": -91.0887680053711, "loss": 1.3775, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006912318058311939, "rewards/margins": 0.009255615994334221, "rewards/rejected": -0.002343298401683569, "step": 35 }, { "epoch": 0.05442176870748299, "grad_norm": 17.668521881103516, "kl/avg_steps": 0.03125, "kl/beta": 0.09800779074430466, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 2.611940298507462e-07, "logits/chosen": -0.9533746838569641, "logits/rejected": -0.4546297788619995, "logps/chosen": -67.0625228881836, "logps/ref_chosen": -67.06676483154297, "logps/ref_rejected": -99.34661865234375, "logps/rejected": -99.37528228759766, "loss": 1.3839, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0003041817108169198, "rewards/margins": 0.0029306563083082438, "rewards/rejected": -0.0026264747139066458, "step": 36 }, { "epoch": 0.055933484504913075, "grad_norm": 23.152936935424805, "kl/avg_steps": -0.03125, "kl/beta": 0.0979771688580513, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 2.686567164179104e-07, "logits/chosen": -0.739529550075531, "logits/rejected": -0.6823672652244568, "logps/chosen": -75.89892578125, "logps/ref_chosen": -75.92698669433594, "logps/ref_rejected": -130.34371948242188, "logps/rejected": -130.28778076171875, "loss": 1.3899, "rewards/accuracies": 0.46875, "rewards/chosen": 0.002641711849719286, "rewards/margins": -0.0030628300737589598, "rewards/rejected": 0.005704541690647602, "step": 37 }, { "epoch": 0.05744520030234316, "grad_norm": 17.994104385375977, "kl/avg_steps": 0.015625, "kl/beta": 0.09800779819488525, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.5, "learning_rate": 2.761194029850746e-07, "logits/chosen": -0.3506224751472473, "logits/rejected": -0.31289514899253845, "logps/chosen": -83.65824127197266, "logps/ref_chosen": -83.65460205078125, "logps/ref_rejected": -89.15221405029297, "logps/rejected": -89.1767349243164, "loss": 1.385, "rewards/accuracies": 0.515625, "rewards/chosen": -0.00048196763964369893, "rewards/margins": 0.0017501943511888385, "rewards/rejected": -0.0022321625147014856, "step": 38 }, { "epoch": 0.05895691609977324, "grad_norm": 18.549318313598633, "kl/avg_steps": 0.125, "kl/beta": 0.09799248725175858, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.8358208955223876e-07, "logits/chosen": -0.5775608420372009, "logits/rejected": -0.3955717086791992, "logps/chosen": -76.12467956542969, "logps/ref_chosen": -76.18706512451172, "logps/ref_rejected": -94.39262390136719, "logps/rejected": -94.3853530883789, "loss": 1.3816, "rewards/accuracies": 0.546875, "rewards/chosen": 0.005926312878727913, "rewards/margins": 0.005100561771541834, "rewards/rejected": 0.0008257507579401135, "step": 39 }, { "epoch": 0.06046863189720333, "grad_norm": 17.475339889526367, "kl/avg_steps": 0.140625, "kl/beta": 0.09787014871835709, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.9104477611940296e-07, "logits/chosen": -0.5079457759857178, "logits/rejected": -0.4361386001110077, "logps/chosen": -77.43675231933594, "logps/ref_chosen": -77.43476867675781, "logps/ref_rejected": -98.58720397949219, "logps/rejected": -98.69015502929688, "loss": 1.377, "rewards/accuracies": 0.578125, "rewards/chosen": -0.00033624155912548304, "rewards/margins": 0.009605846367776394, "rewards/rejected": -0.009942087344825268, "step": 40 }, { "epoch": 0.06198034769463341, "grad_norm": 18.129384994506836, "kl/avg_steps": 0.203125, "kl/beta": 0.09773271530866623, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 2.985074626865671e-07, "logits/chosen": -0.6582231521606445, "logits/rejected": -0.6316337585449219, "logps/chosen": -86.85847473144531, "logps/ref_chosen": -86.87641143798828, "logps/ref_rejected": -101.0856704711914, "logps/rejected": -101.16006469726562, "loss": 1.3779, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0016077004838734865, "rewards/margins": 0.008732382208108902, "rewards/rejected": -0.007124680560082197, "step": 41 }, { "epoch": 0.06349206349206349, "grad_norm": 17.808015823364258, "kl/avg_steps": -0.078125, "kl/beta": 0.09753459692001343, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.453125, "learning_rate": 3.059701492537313e-07, "logits/chosen": -0.6215388774871826, "logits/rejected": -0.5111829042434692, "logps/chosen": -79.35958099365234, "logps/ref_chosen": -79.35625457763672, "logps/ref_rejected": -91.5488052368164, "logps/rejected": -91.5380859375, "loss": 1.3884, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0004895464517176151, "rewards/margins": -0.0016764979809522629, "rewards/rejected": 0.001186951994895935, "step": 42 }, { "epoch": 0.06500377928949358, "grad_norm": 19.075096130371094, "kl/avg_steps": -0.0625, "kl/beta": 0.09761085361242294, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 3.134328358208955e-07, "logits/chosen": -0.2507287263870239, "logits/rejected": -0.5635038614273071, "logps/chosen": -90.81982421875, "logps/ref_chosen": -90.81220245361328, "logps/ref_rejected": -94.16317749023438, "logps/rejected": -94.09054565429688, "loss": 1.3948, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0008870699675753713, "rewards/margins": -0.008112498559057713, "rewards/rejected": 0.007225428242236376, "step": 43 }, { "epoch": 0.06651549508692366, "grad_norm": 18.593828201293945, "kl/avg_steps": 0.046875, "kl/beta": 0.09767189621925354, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.515625, "learning_rate": 3.2089552238805965e-07, "logits/chosen": -0.8580632209777832, "logits/rejected": -0.6987817287445068, "logps/chosen": -88.23231506347656, "logps/ref_chosen": -88.27932739257812, "logps/ref_rejected": -101.14324951171875, "logps/rejected": -101.09764099121094, "loss": 1.3868, "rewards/accuracies": 0.515625, "rewards/chosen": 0.004492661450058222, "rewards/margins": -0.00015667756088078022, "rewards/rejected": 0.004649339243769646, "step": 44 }, { "epoch": 0.06802721088435375, "grad_norm": 18.8914852142334, "kl/avg_steps": 0.125, "kl/beta": 0.09762613475322723, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.2835820895522385e-07, "logits/chosen": -0.7590723037719727, "logits/rejected": -0.40023982524871826, "logps/chosen": -78.38350677490234, "logps/ref_chosen": -78.40264892578125, "logps/ref_rejected": -109.39339447021484, "logps/rejected": -109.42718505859375, "loss": 1.382, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0017465527635067701, "rewards/margins": 0.004819178022444248, "rewards/rejected": -0.003072625258937478, "step": 45 }, { "epoch": 0.06953892668178382, "grad_norm": 17.96132469177246, "kl/avg_steps": 0.0625, "kl/beta": 0.09750425815582275, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 3.3582089552238805e-07, "logits/chosen": -0.6594468355178833, "logits/rejected": -0.7436229586601257, "logps/chosen": -77.98482513427734, "logps/ref_chosen": -78.08491516113281, "logps/ref_rejected": -97.42544555664062, "logps/rejected": -97.40345764160156, "loss": 1.3795, "rewards/accuracies": 0.53125, "rewards/chosen": 0.009642795659601688, "rewards/margins": 0.007294987328350544, "rewards/rejected": 0.002347808564081788, "step": 46 }, { "epoch": 0.0710506424792139, "grad_norm": 18.438560485839844, "kl/avg_steps": 0.25, "kl/beta": 0.0974433571100235, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.432835820895522e-07, "logits/chosen": -0.5594514012336731, "logits/rejected": -0.30259305238723755, "logps/chosen": -70.72454833984375, "logps/ref_chosen": -70.78988647460938, "logps/ref_rejected": -91.17266845703125, "logps/rejected": -91.22081756591797, "loss": 1.3759, "rewards/accuracies": 0.625, "rewards/chosen": 0.006220364943146706, "rewards/margins": 0.010731121525168419, "rewards/rejected": -0.0045107570476830006, "step": 47 }, { "epoch": 0.07256235827664399, "grad_norm": 16.562816619873047, "kl/avg_steps": 0.0625, "kl/beta": 0.09720035642385483, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 3.507462686567164e-07, "logits/chosen": -0.771056056022644, "logits/rejected": -0.6000367403030396, "logps/chosen": -66.6251220703125, "logps/ref_chosen": -66.67327880859375, "logps/ref_rejected": -79.28543090820312, "logps/rejected": -79.26315307617188, "loss": 1.3842, "rewards/accuracies": 0.5, "rewards/chosen": 0.004568049218505621, "rewards/margins": 0.0022856390569359064, "rewards/rejected": 0.002282409928739071, "step": 48 }, { "epoch": 0.07407407407407407, "grad_norm": 17.03924560546875, "kl/avg_steps": 0.125, "kl/beta": 0.0971396416425705, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.5820895522388055e-07, "logits/chosen": -0.5533872842788696, "logits/rejected": -0.48292940855026245, "logps/chosen": -75.0789794921875, "logps/ref_chosen": -75.17504119873047, "logps/ref_rejected": -80.5369873046875, "logps/rejected": -80.46534729003906, "loss": 1.3845, "rewards/accuracies": 0.578125, "rewards/chosen": 0.009206226095557213, "rewards/margins": 0.0020941859111189842, "rewards/rejected": 0.007112039718776941, "step": 49 }, { "epoch": 0.07558578987150416, "grad_norm": 17.23259925842285, "kl/avg_steps": 0.125, "kl/beta": 0.09701836854219437, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 3.6567164179104475e-07, "logits/chosen": -0.6811853647232056, "logits/rejected": -0.4545362591743469, "logps/chosen": -71.20023345947266, "logps/ref_chosen": -71.2314224243164, "logps/ref_rejected": -87.59088134765625, "logps/rejected": -87.6037368774414, "loss": 1.3825, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0029269284568727016, "rewards/margins": 0.004028110299259424, "rewards/rejected": -0.001101181609556079, "step": 50 }, { "epoch": 0.07709750566893424, "grad_norm": 18.064058303833008, "kl/avg_steps": 0.15625, "kl/beta": 0.09689724445343018, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.7313432835820895e-07, "logits/chosen": -0.7829879522323608, "logits/rejected": -0.6104872226715088, "logps/chosen": -78.68687438964844, "logps/ref_chosen": -78.69171142578125, "logps/ref_rejected": -100.78950500488281, "logps/rejected": -100.80244445800781, "loss": 1.3853, "rewards/accuracies": 0.546875, "rewards/chosen": 0.00030520849395543337, "rewards/margins": 0.0013936648610979319, "rewards/rejected": -0.0010884563671424985, "step": 51 }, { "epoch": 0.07860922146636433, "grad_norm": 19.703731536865234, "kl/avg_steps": 0.0, "kl/beta": 0.09674607962369919, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 3.805970149253731e-07, "logits/chosen": -0.6990611553192139, "logits/rejected": -0.5098797678947449, "logps/chosen": -89.09143829345703, "logps/ref_chosen": -89.09419250488281, "logps/ref_rejected": -116.87468719482422, "logps/rejected": -116.89561462402344, "loss": 1.3848, "rewards/accuracies": 0.5, "rewards/chosen": 0.00012077903375029564, "rewards/margins": 0.0019711265340447426, "rewards/rejected": -0.001850348082371056, "step": 52 }, { "epoch": 0.0801209372637944, "grad_norm": 16.809465408325195, "kl/avg_steps": -0.09375, "kl/beta": 0.09674607962369919, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 3.880597014925373e-07, "logits/chosen": -0.8433110117912292, "logits/rejected": -0.894573450088501, "logps/chosen": -74.12965393066406, "logps/ref_chosen": -74.21418762207031, "logps/ref_rejected": -75.71167755126953, "logps/rejected": -75.67427062988281, "loss": 1.3826, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00799822248518467, "rewards/margins": 0.004199557937681675, "rewards/rejected": 0.00379866361618042, "step": 53 }, { "epoch": 0.08163265306122448, "grad_norm": 15.830225944519043, "kl/avg_steps": 0.21875, "kl/beta": 0.0968368649482727, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 3.9552238805970144e-07, "logits/chosen": -0.8751938939094543, "logits/rejected": -0.7324712872505188, "logps/chosen": -65.55152893066406, "logps/ref_chosen": -65.63475799560547, "logps/ref_rejected": -76.4462890625, "logps/rejected": -76.46516418457031, "loss": 1.3771, "rewards/accuracies": 0.59375, "rewards/chosen": 0.007865255698561668, "rewards/margins": 0.009572221897542477, "rewards/rejected": -0.0017069653840735555, "step": 54 }, { "epoch": 0.08314436885865457, "grad_norm": 19.163211822509766, "kl/avg_steps": 0.21875, "kl/beta": 0.09662549942731857, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.0298507462686564e-07, "logits/chosen": -0.4196467101573944, "logits/rejected": -0.2608944773674011, "logps/chosen": -68.71656799316406, "logps/ref_chosen": -68.7640380859375, "logps/ref_rejected": -108.80075073242188, "logps/rejected": -108.87657928466797, "loss": 1.3752, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0043769595213234425, "rewards/margins": 0.011570327915251255, "rewards/rejected": -0.007193367928266525, "step": 55 }, { "epoch": 0.08465608465608465, "grad_norm": 16.41425132751465, "kl/avg_steps": 0.03125, "kl/beta": 0.09641458839178085, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.1044776119402984e-07, "logits/chosen": -0.6438100337982178, "logits/rejected": -0.563813328742981, "logps/chosen": -74.7386703491211, "logps/ref_chosen": -74.7939453125, "logps/ref_rejected": -81.83535766601562, "logps/rejected": -81.83403015136719, "loss": 1.3819, "rewards/accuracies": 0.5, "rewards/chosen": 0.005185459740459919, "rewards/margins": 0.0048543717712163925, "rewards/rejected": 0.0003310886677354574, "step": 56 }, { "epoch": 0.08616780045351474, "grad_norm": 18.06746482849121, "kl/avg_steps": 0.375, "kl/beta": 0.09638447314500809, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.17910447761194e-07, "logits/chosen": -0.8478030562400818, "logits/rejected": -0.8586157560348511, "logps/chosen": -74.48088073730469, "logps/ref_chosen": -74.5794677734375, "logps/ref_rejected": -105.61981964111328, "logps/rejected": -105.72442626953125, "loss": 1.3678, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009298881515860558, "rewards/margins": 0.019189005717635155, "rewards/rejected": -0.009890124201774597, "step": 57 }, { "epoch": 0.08767951625094482, "grad_norm": 18.519271850585938, "kl/avg_steps": -0.125, "kl/beta": 0.09602437913417816, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 4.253731343283582e-07, "logits/chosen": -0.589920163154602, "logits/rejected": -0.5984715819358826, "logps/chosen": -92.21888732910156, "logps/ref_chosen": -92.24464416503906, "logps/ref_rejected": -103.18975830078125, "logps/rejected": -103.20128631591797, "loss": 1.3835, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0023432730231434107, "rewards/margins": 0.0032569742761552334, "rewards/rejected": -0.0009137009037658572, "step": 58 }, { "epoch": 0.08919123204837491, "grad_norm": 16.03529930114746, "kl/avg_steps": 0.375, "kl/beta": 0.09614455699920654, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.3283582089552234e-07, "logits/chosen": -0.4612119793891907, "logits/rejected": -0.7404814958572388, "logps/chosen": -66.95186614990234, "logps/ref_chosen": -67.12688446044922, "logps/ref_rejected": -91.69569396972656, "logps/rejected": -91.7687759399414, "loss": 1.3636, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01665889285504818, "rewards/margins": 0.023446228355169296, "rewards/rejected": -0.006787334103137255, "step": 59 }, { "epoch": 0.09070294784580499, "grad_norm": 17.765792846679688, "kl/avg_steps": 0.125, "kl/beta": 0.09578536450862885, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.4029850746268654e-07, "logits/chosen": -0.7013375759124756, "logits/rejected": -0.5072432160377502, "logps/chosen": -79.59794616699219, "logps/ref_chosen": -79.74327850341797, "logps/ref_rejected": -77.89244079589844, "logps/rejected": -77.80746459960938, "loss": 1.3818, "rewards/accuracies": 0.578125, "rewards/chosen": 0.013693554326891899, "rewards/margins": 0.005363212898373604, "rewards/rejected": 0.00833034235984087, "step": 60 }, { "epoch": 0.09221466364323508, "grad_norm": 15.700193405151367, "kl/avg_steps": 0.21875, "kl/beta": 0.09566578269004822, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.4776119402985074e-07, "logits/chosen": -1.1494168043136597, "logits/rejected": -0.5027548670768738, "logps/chosen": -65.95541381835938, "logps/ref_chosen": -66.08685302734375, "logps/ref_rejected": -88.1458740234375, "logps/rejected": -88.13238525390625, "loss": 1.3758, "rewards/accuracies": 0.625, "rewards/chosen": 0.012441026046872139, "rewards/margins": 0.01096752192825079, "rewards/rejected": 0.0014735042350366712, "step": 61 }, { "epoch": 0.09372637944066516, "grad_norm": 16.980772018432617, "kl/avg_steps": 0.3125, "kl/beta": 0.09545697271823883, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.552238805970149e-07, "logits/chosen": -0.628500759601593, "logits/rejected": -0.4428566098213196, "logps/chosen": -80.88948059082031, "logps/ref_chosen": -81.0108871459961, "logps/ref_rejected": -95.50444793701172, "logps/rejected": -95.56391143798828, "loss": 1.37, "rewards/accuracies": 0.671875, "rewards/chosen": 0.011412292718887329, "rewards/margins": 0.016900725662708282, "rewards/rejected": -0.005488432943820953, "step": 62 }, { "epoch": 0.09523809523809523, "grad_norm": 18.504274368286133, "kl/avg_steps": 0.234375, "kl/beta": 0.09515959769487381, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.626865671641791e-07, "logits/chosen": -0.5986104011535645, "logits/rejected": -0.4796867370605469, "logps/chosen": -78.40611267089844, "logps/ref_chosen": -78.57593536376953, "logps/ref_rejected": -99.71000671386719, "logps/rejected": -99.65341186523438, "loss": 1.3766, "rewards/accuracies": 0.640625, "rewards/chosen": 0.01594492793083191, "rewards/margins": 0.01040346547961235, "rewards/rejected": 0.005541461519896984, "step": 63 }, { "epoch": 0.09674981103552532, "grad_norm": 15.803566932678223, "kl/avg_steps": 0.09375, "kl/beta": 0.09493708610534668, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.701492537313433e-07, "logits/chosen": -0.7529503703117371, "logits/rejected": -0.6055833697319031, "logps/chosen": -69.16105651855469, "logps/ref_chosen": -69.24063110351562, "logps/ref_rejected": -84.14842987060547, "logps/rejected": -84.07394409179688, "loss": 1.387, "rewards/accuracies": 0.5, "rewards/chosen": 0.007316782139241695, "rewards/margins": 8.302222704514861e-05, "rewards/rejected": 0.007233759853988886, "step": 64 }, { "epoch": 0.0982615268329554, "grad_norm": 17.927310943603516, "kl/avg_steps": 0.0625, "kl/beta": 0.0948481634259224, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 4.776119402985074e-07, "logits/chosen": -0.7595020532608032, "logits/rejected": -0.637890100479126, "logps/chosen": -83.99519348144531, "logps/ref_chosen": -84.0351333618164, "logps/ref_rejected": -96.42926788330078, "logps/rejected": -96.46531677246094, "loss": 1.3802, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0035090160090476274, "rewards/margins": 0.006791706196963787, "rewards/rejected": -0.0032826901879161596, "step": 65 }, { "epoch": 0.09977324263038549, "grad_norm": 17.392560958862305, "kl/avg_steps": 0.34375, "kl/beta": 0.09478892385959625, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.850746268656717e-07, "logits/chosen": -0.8872799277305603, "logits/rejected": -0.9815646409988403, "logps/chosen": -87.69140625, "logps/ref_chosen": -87.79239654541016, "logps/ref_rejected": -95.26547241210938, "logps/rejected": -95.44229888916016, "loss": 1.3614, "rewards/accuracies": 0.671875, "rewards/chosen": 0.009347852319478989, "rewards/margins": 0.02584635280072689, "rewards/rejected": -0.016498498618602753, "step": 66 }, { "epoch": 0.10128495842781557, "grad_norm": 17.885372161865234, "kl/avg_steps": 0.171875, "kl/beta": 0.09446420520544052, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.925373134328357e-07, "logits/chosen": -0.8714014887809753, "logits/rejected": -0.7710602283477783, "logps/chosen": -77.86466979980469, "logps/ref_chosen": -78.00114440917969, "logps/ref_rejected": -96.03421020507812, "logps/rejected": -96.05657958984375, "loss": 1.3726, "rewards/accuracies": 0.578125, "rewards/chosen": 0.012656296603381634, "rewards/margins": 0.014563833363354206, "rewards/rejected": -0.001907536992803216, "step": 67 }, { "epoch": 0.10279667422524566, "grad_norm": 18.616788864135742, "kl/avg_steps": 0.375, "kl/beta": 0.09430211782455444, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 5e-07, "logits/chosen": -0.5614684820175171, "logits/rejected": -0.508707582950592, "logps/chosen": -95.97196960449219, "logps/ref_chosen": -96.04268646240234, "logps/ref_rejected": -110.91169738769531, "logps/rejected": -111.02496337890625, "loss": 1.3705, "rewards/accuracies": 0.671875, "rewards/chosen": 0.006450447719544172, "rewards/margins": 0.01685434952378273, "rewards/rejected": -0.010403899475932121, "step": 68 }, { "epoch": 0.10430839002267574, "grad_norm": 18.57022476196289, "kl/avg_steps": 0.40625, "kl/beta": 0.0939498096704483, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.999965034812934e-07, "logits/chosen": -0.6774875521659851, "logits/rejected": -0.5702620148658752, "logps/chosen": -84.87574768066406, "logps/ref_chosen": -85.11125183105469, "logps/ref_rejected": -107.57357025146484, "logps/rejected": -107.622802734375, "loss": 1.3609, "rewards/accuracies": 0.734375, "rewards/chosen": 0.02191595546901226, "rewards/margins": 0.026317887008190155, "rewards/rejected": -0.004401930142194033, "step": 69 }, { "epoch": 0.10582010582010581, "grad_norm": 17.01119041442871, "kl/avg_steps": 0.28125, "kl/beta": 0.09356968104839325, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.999860140229787e-07, "logits/chosen": -0.5370590686798096, "logits/rejected": -0.4853627681732178, "logps/chosen": -81.58915710449219, "logps/ref_chosen": -81.87960815429688, "logps/ref_rejected": -92.63243103027344, "logps/rejected": -92.62098693847656, "loss": 1.3617, "rewards/accuracies": 0.609375, "rewards/chosen": 0.026898501440882683, "rewards/margins": 0.025649238377809525, "rewards/rejected": 0.001249261200428009, "step": 70 }, { "epoch": 0.1073318216175359, "grad_norm": 16.591291427612305, "kl/avg_steps": 0.15625, "kl/beta": 0.0933072566986084, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.999685319184688e-07, "logits/chosen": -0.8710042238235474, "logits/rejected": -0.695549488067627, "logps/chosen": -79.66912841796875, "logps/ref_chosen": -79.74766540527344, "logps/ref_rejected": -83.39110565185547, "logps/rejected": -83.38461303710938, "loss": 1.3808, "rewards/accuracies": 0.59375, "rewards/chosen": 0.007052659057080746, "rewards/margins": 0.006284390110522509, "rewards/rejected": 0.0007682680152356625, "step": 71 }, { "epoch": 0.10884353741496598, "grad_norm": 17.93051528930664, "kl/avg_steps": 0.4375, "kl/beta": 0.0931616947054863, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.999440576567755e-07, "logits/chosen": -0.8033581376075745, "logits/rejected": -0.8477033376693726, "logps/chosen": -72.75706481933594, "logps/ref_chosen": -73.04458618164062, "logps/ref_rejected": -92.64720153808594, "logps/rejected": -92.60933685302734, "loss": 1.3645, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02642909437417984, "rewards/margins": 0.022784588858485222, "rewards/rejected": 0.0036445085424929857, "step": 72 }, { "epoch": 0.11035525321239607, "grad_norm": 18.030593872070312, "kl/avg_steps": -0.0625, "kl/beta": 0.09275588393211365, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 4.999125919224965e-07, "logits/chosen": -0.773788571357727, "logits/rejected": -0.8637920022010803, "logps/chosen": -87.6549072265625, "logps/ref_chosen": -87.71681213378906, "logps/ref_rejected": -96.93572998046875, "logps/rejected": -96.90829467773438, "loss": 1.3848, "rewards/accuracies": 0.46875, "rewards/chosen": 0.005404962692409754, "rewards/margins": 0.0026616945397108793, "rewards/rejected": 0.0027432686183601618, "step": 73 }, { "epoch": 0.11186696900982615, "grad_norm": 16.754348754882812, "kl/avg_steps": 0.3125, "kl/beta": 0.0928138941526413, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.998741355957963e-07, "logits/chosen": -0.7274940013885498, "logits/rejected": -0.4740767776966095, "logps/chosen": -66.65380859375, "logps/ref_chosen": -67.07321166992188, "logps/ref_rejected": -96.53402709960938, "logps/rejected": -96.39031982421875, "loss": 1.3624, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03866203874349594, "rewards/margins": 0.025037020444869995, "rewards/rejected": 0.013625022023916245, "step": 74 }, { "epoch": 0.11337868480725624, "grad_norm": 15.731212615966797, "kl/avg_steps": 0.25, "kl/beta": 0.09252475202083588, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.998286897523808e-07, "logits/chosen": -0.85367751121521, "logits/rejected": -0.7722653150558472, "logps/chosen": -61.570716857910156, "logps/ref_chosen": -61.80186462402344, "logps/ref_rejected": -82.37368774414062, "logps/rejected": -82.36898803710938, "loss": 1.367, "rewards/accuracies": 0.640625, "rewards/chosen": 0.021144213154911995, "rewards/margins": 0.020450696349143982, "rewards/rejected": 0.0006935172714293003, "step": 75 }, { "epoch": 0.11489040060468632, "grad_norm": 16.436174392700195, "kl/avg_steps": 0.25, "kl/beta": 0.09229401499032974, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.997762556634679e-07, "logits/chosen": -0.8176724314689636, "logits/rejected": -0.6024997234344482, "logps/chosen": -69.61163330078125, "logps/ref_chosen": -69.92233276367188, "logps/ref_rejected": -97.08378601074219, "logps/rejected": -97.02426147460938, "loss": 1.3652, "rewards/accuracies": 0.59375, "rewards/chosen": 0.028404513373970985, "rewards/margins": 0.022574743255972862, "rewards/rejected": 0.005829768255352974, "step": 76 }, { "epoch": 0.1164021164021164, "grad_norm": 16.809951782226562, "kl/avg_steps": 0.3125, "kl/beta": 0.09206385910511017, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.99716834795752e-07, "logits/chosen": -1.176077127456665, "logits/rejected": -0.774753749370575, "logps/chosen": -70.80705261230469, "logps/ref_chosen": -71.206298828125, "logps/ref_rejected": -95.22071075439453, "logps/rejected": -95.2851791381836, "loss": 1.346, "rewards/accuracies": 0.65625, "rewards/chosen": 0.036420077085494995, "rewards/margins": 0.04206133261322975, "rewards/rejected": -0.005641256459057331, "step": 77 }, { "epoch": 0.11791383219954649, "grad_norm": 16.423564910888672, "kl/avg_steps": 0.21875, "kl/beta": 0.09177705645561218, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.996504288113623e-07, "logits/chosen": -0.6873102188110352, "logits/rejected": -0.5276945233345032, "logps/chosen": -84.00184631347656, "logps/ref_chosen": -84.40055847167969, "logps/ref_rejected": -95.41949462890625, "logps/rejected": -95.31796264648438, "loss": 1.3609, "rewards/accuracies": 0.609375, "rewards/chosen": 0.03620798885822296, "rewards/margins": 0.026718970388174057, "rewards/rejected": 0.009489016607403755, "step": 78 }, { "epoch": 0.11942554799697656, "grad_norm": 17.653905868530273, "kl/avg_steps": 0.34375, "kl/beta": 0.09157673269510269, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.995770395678171e-07, "logits/chosen": -0.7041028738021851, "logits/rejected": -0.6291458010673523, "logps/chosen": -65.53144836425781, "logps/ref_chosen": -65.93923950195312, "logps/ref_rejected": -102.92240905761719, "logps/rejected": -102.9657211303711, "loss": 1.3484, "rewards/accuracies": 0.671875, "rewards/chosen": 0.036987803876399994, "rewards/margins": 0.040554363280534744, "rewards/rejected": -0.0035665626637637615, "step": 79 }, { "epoch": 0.12093726379440665, "grad_norm": 15.999527931213379, "kl/avg_steps": 0.21875, "kl/beta": 0.09126301109790802, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.994966691179711e-07, "logits/chosen": -0.6971664428710938, "logits/rejected": -0.7449191212654114, "logps/chosen": -78.33244323730469, "logps/ref_chosen": -78.61624908447266, "logps/ref_rejected": -99.9122314453125, "logps/rejected": -99.92466735839844, "loss": 1.3617, "rewards/accuracies": 0.625, "rewards/chosen": 0.02554541453719139, "rewards/margins": 0.02635783888399601, "rewards/rejected": -0.0008124255109578371, "step": 80 }, { "epoch": 0.12244897959183673, "grad_norm": 16.314882278442383, "kl/avg_steps": 0.28125, "kl/beta": 0.09106381237506866, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.994093197099587e-07, "logits/chosen": -0.8343544006347656, "logits/rejected": -0.7506792545318604, "logps/chosen": -79.16024780273438, "logps/ref_chosen": -79.49640655517578, "logps/ref_rejected": -94.52413940429688, "logps/rejected": -94.53294372558594, "loss": 1.3575, "rewards/accuracies": 0.671875, "rewards/chosen": 0.030200045555830002, "rewards/margins": 0.030745631083846092, "rewards/rejected": -0.0005455873906612396, "step": 81 }, { "epoch": 0.12396069538926682, "grad_norm": 16.32975959777832, "kl/avg_steps": 0.5, "kl/beta": 0.09080841392278671, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.993149937871306e-07, "logits/chosen": -0.7907916903495789, "logits/rejected": -0.6614448428153992, "logps/chosen": -64.36497497558594, "logps/ref_chosen": -64.97168731689453, "logps/ref_rejected": -86.69085693359375, "logps/rejected": -86.62161254882812, "loss": 1.3408, "rewards/accuracies": 0.75, "rewards/chosen": 0.054587192833423615, "rewards/margins": 0.04804161936044693, "rewards/rejected": 0.0065455688163638115, "step": 82 }, { "epoch": 0.1254724111866969, "grad_norm": 16.859764099121094, "kl/avg_steps": 0.40625, "kl/beta": 0.09035663306713104, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.992136939879856e-07, "logits/chosen": -0.8817363977432251, "logits/rejected": -0.891059398651123, "logps/chosen": -72.4176254272461, "logps/ref_chosen": -72.92498779296875, "logps/ref_rejected": -92.27165222167969, "logps/rejected": -92.21333312988281, "loss": 1.3487, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04530956968665123, "rewards/margins": 0.03982119634747505, "rewards/rejected": 0.005488371476531029, "step": 83 }, { "epoch": 0.12698412698412698, "grad_norm": 17.500118255615234, "kl/avg_steps": 0.46875, "kl/beta": 0.08999104052782059, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.991054231460969e-07, "logits/chosen": -0.6272699236869812, "logits/rejected": -0.5706059336662292, "logps/chosen": -81.32762145996094, "logps/ref_chosen": -81.79109191894531, "logps/ref_rejected": -99.20896911621094, "logps/rejected": -99.24348449707031, "loss": 1.3445, "rewards/accuracies": 0.734375, "rewards/chosen": 0.041328877210617065, "rewards/margins": 0.04405710846185684, "rewards/rejected": -0.0027282284572720528, "step": 84 }, { "epoch": 0.12849584278155707, "grad_norm": 15.726845741271973, "kl/avg_steps": 0.5, "kl/beta": 0.08957117795944214, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.989901842900325e-07, "logits/chosen": -1.0663063526153564, "logits/rejected": -1.011609435081482, "logps/chosen": -67.32447814941406, "logps/ref_chosen": -67.94148254394531, "logps/ref_rejected": -85.76875305175781, "logps/rejected": -85.65890502929688, "loss": 1.3438, "rewards/accuracies": 0.765625, "rewards/chosen": 0.054649144411087036, "rewards/margins": 0.0446481890976429, "rewards/rejected": 0.010000954382121563, "step": 85 }, { "epoch": 0.13000755857898716, "grad_norm": 15.644314765930176, "kl/avg_steps": 0.15625, "kl/beta": 0.08912555128335953, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.9160196781158447, "logits/rejected": -0.9157437086105347, "logps/chosen": -78.93154907226562, "logps/ref_chosen": -79.21485900878906, "logps/ref_rejected": -88.69877624511719, "logps/rejected": -88.68402099609375, "loss": 1.3645, "rewards/accuracies": 0.578125, "rewards/chosen": 0.024859676137566566, "rewards/margins": 0.023322567343711853, "rewards/rejected": 0.0015371122863143682, "step": 86 }, { "epoch": 0.13151927437641722, "grad_norm": 16.862993240356445, "kl/avg_steps": 0.40625, "kl/beta": 0.08898650854825974, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.987388156241114e-07, "logits/chosen": -0.8834874629974365, "logits/rejected": -1.0349664688110352, "logps/chosen": -83.93854522705078, "logps/ref_chosen": -84.45362854003906, "logps/ref_rejected": -103.438232421875, "logps/rejected": -103.58685302734375, "loss": 1.3317, "rewards/accuracies": 0.703125, "rewards/chosen": 0.04530634358525276, "rewards/margins": 0.05805563926696777, "rewards/rejected": -0.012749293819069862, "step": 87 }, { "epoch": 0.1330309901738473, "grad_norm": 16.096044540405273, "kl/avg_steps": 0.28125, "kl/beta": 0.08862645924091339, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.986026928455767e-07, "logits/chosen": -1.078216314315796, "logits/rejected": -0.7487344741821289, "logps/chosen": -80.88137817382812, "logps/ref_chosen": -81.27230834960938, "logps/ref_rejected": -89.51646423339844, "logps/rejected": -89.49003601074219, "loss": 1.3573, "rewards/accuracies": 0.640625, "rewards/chosen": 0.03411562368273735, "rewards/margins": 0.031519923359155655, "rewards/rejected": 0.0025957003235816956, "step": 88 }, { "epoch": 0.1345427059712774, "grad_norm": 16.19232749938965, "kl/avg_steps": 0.4375, "kl/beta": 0.08837790042161942, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.984596161153135e-07, "logits/chosen": -0.9088029861450195, "logits/rejected": -0.9495557546615601, "logps/chosen": -57.38954162597656, "logps/ref_chosen": -58.142333984375, "logps/ref_rejected": -102.53756713867188, "logps/rejected": -102.58346557617188, "loss": 1.321, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06586841493844986, "rewards/margins": 0.06961293518543243, "rewards/rejected": -0.003744515124708414, "step": 89 }, { "epoch": 0.1360544217687075, "grad_norm": 17.48473358154297, "kl/avg_steps": 0.40625, "kl/beta": 0.08799292892217636, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.983095894354857e-07, "logits/chosen": -0.8699663281440735, "logits/rejected": -0.855407178401947, "logps/chosen": -74.75665283203125, "logps/ref_chosen": -75.26505279541016, "logps/ref_rejected": -104.32841491699219, "logps/rejected": -104.36683654785156, "loss": 1.3422, "rewards/accuracies": 0.703125, "rewards/chosen": 0.04425486922264099, "rewards/margins": 0.04724379628896713, "rewards/rejected": -0.00298893079161644, "step": 90 }, { "epoch": 0.13756613756613756, "grad_norm": 15.36704158782959, "kl/avg_steps": 0.1875, "kl/beta": 0.08763690292835236, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.98152617002662e-07, "logits/chosen": -0.9106171131134033, "logits/rejected": -1.0182747840881348, "logps/chosen": -68.7718505859375, "logps/ref_chosen": -69.33902740478516, "logps/ref_rejected": -90.31411743164062, "logps/rejected": -90.28929901123047, "loss": 1.3438, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04909837990999222, "rewards/margins": 0.04654748737812042, "rewards/rejected": 0.0025508906692266464, "step": 91 }, { "epoch": 0.13907785336356765, "grad_norm": 16.497882843017578, "kl/avg_steps": 0.28125, "kl/beta": 0.08747289329767227, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.979887032076988e-07, "logits/chosen": -0.8337477445602417, "logits/rejected": -0.7373151779174805, "logps/chosen": -71.86427307128906, "logps/ref_chosen": -72.4566650390625, "logps/ref_rejected": -91.6706771850586, "logps/rejected": -91.6868896484375, "loss": 1.3376, "rewards/accuracies": 0.640625, "rewards/chosen": 0.051354095339775085, "rewards/margins": 0.052345022559165955, "rewards/rejected": -0.00099092535674572, "step": 92 }, { "epoch": 0.14058956916099774, "grad_norm": 14.10353946685791, "kl/avg_steps": 0.25, "kl/beta": 0.08722756803035736, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.978178526356172e-07, "logits/chosen": -1.1317577362060547, "logits/rejected": -0.7728261947631836, "logps/chosen": -63.39311218261719, "logps/ref_chosen": -64.08897399902344, "logps/ref_rejected": -75.09095764160156, "logps/rejected": -74.87936401367188, "loss": 1.3502, "rewards/accuracies": 0.640625, "rewards/chosen": 0.06010336056351662, "rewards/margins": 0.04119066148996353, "rewards/rejected": 0.018912700936198235, "step": 93 }, { "epoch": 0.1421012849584278, "grad_norm": 18.503324508666992, "kl/avg_steps": 0.53125, "kl/beta": 0.08701004087924957, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.976400700654751e-07, "logits/chosen": -1.0736751556396484, "logits/rejected": -1.2253625392913818, "logps/chosen": -78.85262298583984, "logps/ref_chosen": -79.67372131347656, "logps/ref_rejected": -94.64076232910156, "logps/rejected": -94.75550842285156, "loss": 1.3126, "rewards/accuracies": 0.78125, "rewards/chosen": 0.07063993811607361, "rewards/margins": 0.08020106703042984, "rewards/rejected": -0.009561131708323956, "step": 94 }, { "epoch": 0.1436130007558579, "grad_norm": 16.122169494628906, "kl/avg_steps": 0.28125, "kl/beta": 0.08655024319887161, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.974553604702332e-07, "logits/chosen": -0.7826769351959229, "logits/rejected": -0.6646933555603027, "logps/chosen": -78.21084594726562, "logps/ref_chosen": -78.65760803222656, "logps/ref_rejected": -109.40481567382812, "logps/rejected": -109.658203125, "loss": 1.3318, "rewards/accuracies": 0.625, "rewards/chosen": 0.03827132284641266, "rewards/margins": 0.05950234830379486, "rewards/rejected": -0.021231018006801605, "step": 95 }, { "epoch": 0.14512471655328799, "grad_norm": 16.467144012451172, "kl/avg_steps": 0.25, "kl/beta": 0.08630750328302383, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.972637290166157e-07, "logits/chosen": -1.1068183183670044, "logits/rejected": -0.791649580001831, "logps/chosen": -77.20147705078125, "logps/ref_chosen": -77.70825958251953, "logps/ref_rejected": -104.36044311523438, "logps/rejected": -104.63987731933594, "loss": 1.3252, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0430414117872715, "rewards/margins": 0.06671500205993652, "rewards/rejected": -0.02367359772324562, "step": 96 }, { "epoch": 0.14663643235071808, "grad_norm": 16.57407569885254, "kl/avg_steps": 0.1875, "kl/beta": 0.0860922709107399, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.970651810649666e-07, "logits/chosen": -0.7208718061447144, "logits/rejected": -0.7762876152992249, "logps/chosen": -84.24117279052734, "logps/ref_chosen": -84.58918762207031, "logps/ref_rejected": -99.25704956054688, "logps/rejected": -99.26144409179688, "loss": 1.3618, "rewards/accuracies": 0.578125, "rewards/chosen": 0.02926001325249672, "rewards/margins": 0.02920977585017681, "rewards/rejected": 5.023973062634468e-05, "step": 97 }, { "epoch": 0.14814814814814814, "grad_norm": 15.020103454589844, "kl/avg_steps": 0.3125, "kl/beta": 0.08593115210533142, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.968597221690985e-07, "logits/chosen": -0.9258188009262085, "logits/rejected": -0.6664605140686035, "logps/chosen": -74.06013488769531, "logps/ref_chosen": -74.42477416992188, "logps/ref_rejected": -88.93840026855469, "logps/rejected": -88.95329284667969, "loss": 1.3584, "rewards/accuracies": 0.65625, "rewards/chosen": 0.030702810734510422, "rewards/margins": 0.03164215385913849, "rewards/rejected": -0.0009393435902893543, "step": 98 }, { "epoch": 0.14965986394557823, "grad_norm": 15.487700462341309, "kl/avg_steps": 0.28125, "kl/beta": 0.08566345274448395, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.966473580761389e-07, "logits/chosen": -0.9926242232322693, "logits/rejected": -0.7351720333099365, "logps/chosen": -75.0016098022461, "logps/ref_chosen": -75.5974349975586, "logps/ref_rejected": -98.2310791015625, "logps/rejected": -98.35629272460938, "loss": 1.3342, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05039945989847183, "rewards/margins": 0.06047532707452774, "rewards/rejected": -0.010075867176055908, "step": 99 }, { "epoch": 0.15117157974300832, "grad_norm": 16.720571517944336, "kl/avg_steps": 0.40625, "kl/beta": 0.08542320132255554, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.964280947263676e-07, "logits/chosen": -0.7586959004402161, "logits/rejected": -0.699402391910553, "logps/chosen": -98.01969909667969, "logps/ref_chosen": -98.55859375, "logps/ref_rejected": -106.01295471191406, "logps/rejected": -106.32823181152344, "loss": 1.3249, "rewards/accuracies": 0.703125, "rewards/chosen": 0.04501022771000862, "rewards/margins": 0.07151120156049728, "rewards/rejected": -0.026500973850488663, "step": 100 }, { "epoch": 0.15117157974300832, "eval_kl/n_epsilon_steps": 0.32262325286865234, "eval_kl/p_epsilon_steps": 0.6760563254356384, "eval_logits/chosen": -0.7699905037879944, "eval_logits/rejected": -0.7965146899223328, "eval_logps/chosen": -86.22242736816406, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -96.91645812988281, "eval_loss": 0.6599990129470825, "eval_rewards/accuracies": 0.6822183132171631, "eval_rewards/chosen": 0.05698024854063988, "eval_rewards/margins": 0.07509617507457733, "eval_rewards/rejected": -0.018115932121872902, "eval_runtime": 46.9988, "eval_samples_per_second": 49.001, "eval_steps_per_second": 1.532, "step": 100 }, { "epoch": 0.15268329554043839, "grad_norm": 13.873627662658691, "kl/avg_steps": 0.5625, "kl/beta": 0.08507757633924484, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.96201938253052e-07, "logits/chosen": -1.3703796863555908, "logits/rejected": -0.8814256191253662, "logps/chosen": -68.59088134765625, "logps/ref_chosen": -69.45216369628906, "logps/ref_rejected": -88.04588317871094, "logps/rejected": -88.23822021484375, "loss": 1.3053, "rewards/accuracies": 0.765625, "rewards/chosen": 0.07237797975540161, "rewards/margins": 0.08829944580793381, "rewards/rejected": -0.015921467915177345, "step": 101 }, { "epoch": 0.15419501133786848, "grad_norm": 14.738540649414062, "kl/avg_steps": 0.28125, "kl/beta": 0.08460168540477753, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.959688949822748e-07, "logits/chosen": -0.8969976305961609, "logits/rejected": -0.8516714572906494, "logps/chosen": -79.81106567382812, "logps/ref_chosen": -80.35308837890625, "logps/ref_rejected": -90.61380004882812, "logps/rejected": -90.76178741455078, "loss": 1.337, "rewards/accuracies": 0.65625, "rewards/chosen": 0.045297279953956604, "rewards/margins": 0.057028163224458694, "rewards/rejected": -0.011730881407856941, "step": 102 }, { "epoch": 0.15570672713529857, "grad_norm": 14.868295669555664, "kl/avg_steps": 0.5, "kl/beta": 0.08436441421508789, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.957289714327572e-07, "logits/chosen": -1.2872700691223145, "logits/rejected": -0.8925309777259827, "logps/chosen": -78.43871307373047, "logps/ref_chosen": -79.30392456054688, "logps/ref_rejected": -93.745361328125, "logps/rejected": -94.05059814453125, "loss": 1.2989, "rewards/accuracies": 0.734375, "rewards/chosen": 0.0718870759010315, "rewards/margins": 0.09726110845804214, "rewards/rejected": -0.025374025106430054, "step": 103 }, { "epoch": 0.15721844293272866, "grad_norm": 15.962653160095215, "kl/avg_steps": 0.40625, "kl/beta": 0.08394469320774078, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.6702672243118286, "logits/rejected": -0.5458837747573853, "logps/chosen": -73.63473510742188, "logps/ref_chosen": -74.50674438476562, "logps/ref_rejected": -116.09912872314453, "logps/rejected": -116.38943481445312, "loss": 1.3024, "rewards/accuracies": 0.703125, "rewards/chosen": 0.07236441969871521, "rewards/margins": 0.09585089981555939, "rewards/rejected": -0.023486483842134476, "step": 104 }, { "epoch": 0.15873015873015872, "grad_norm": 15.859411239624023, "kl/avg_steps": 0.3125, "kl/beta": 0.08360504359006882, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.952285105344791e-07, "logits/chosen": -0.8980883955955505, "logits/rejected": -0.9743342995643616, "logps/chosen": -87.27688598632812, "logps/ref_chosen": -87.76654815673828, "logps/ref_rejected": -108.07927703857422, "logps/rejected": -108.57989501953125, "loss": 1.3207, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03998672217130661, "rewards/margins": 0.08090025186538696, "rewards/rejected": -0.04091353714466095, "step": 105 }, { "epoch": 0.1602418745275888, "grad_norm": 14.278593063354492, "kl/avg_steps": 0.375, "kl/beta": 0.08334459364414215, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.949679871846857e-07, "logits/chosen": -0.9452663660049438, "logits/rejected": -0.9867152571678162, "logps/chosen": -75.48688507080078, "logps/ref_chosen": -76.38548278808594, "logps/ref_rejected": -81.63407897949219, "logps/rejected": -81.66070556640625, "loss": 1.3223, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07385978102684021, "rewards/margins": 0.07553449273109436, "rewards/rejected": -0.001674711238592863, "step": 106 }, { "epoch": 0.1617535903250189, "grad_norm": 15.913289070129395, "kl/avg_steps": 0.34375, "kl/beta": 0.08303321897983551, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.947006115536947e-07, "logits/chosen": -1.1638550758361816, "logits/rejected": -0.8853709697723389, "logps/chosen": -95.84579467773438, "logps/ref_chosen": -96.14849853515625, "logps/ref_rejected": -107.0481185913086, "logps/rejected": -107.37059020996094, "loss": 1.346, "rewards/accuracies": 0.65625, "rewards/chosen": 0.024201638996601105, "rewards/margins": 0.05040515959262848, "rewards/rejected": -0.026203524321317673, "step": 107 }, { "epoch": 0.16326530612244897, "grad_norm": 14.315526008605957, "kl/avg_steps": 0.3125, "kl/beta": 0.08274877071380615, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.944263911205772e-07, "logits/chosen": -1.0581843852996826, "logits/rejected": -1.0409530401229858, "logps/chosen": -84.61740112304688, "logps/ref_chosen": -85.39241027832031, "logps/ref_rejected": -97.79592895507812, "logps/rejected": -97.88604736328125, "loss": 1.3261, "rewards/accuracies": 0.640625, "rewards/chosen": 0.06313550472259521, "rewards/margins": 0.07004686444997787, "rewards/rejected": -0.0069113681092858315, "step": 108 }, { "epoch": 0.16477702191987906, "grad_norm": 15.245014190673828, "kl/avg_steps": 0.21875, "kl/beta": 0.08249098807573318, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.941453335558681e-07, "logits/chosen": -1.0355124473571777, "logits/rejected": -1.1666637659072876, "logps/chosen": -78.03228759765625, "logps/ref_chosen": -78.99874877929688, "logps/ref_rejected": -100.79278564453125, "logps/rejected": -101.17771911621094, "loss": 1.2925, "rewards/accuracies": 0.625, "rewards/chosen": 0.0786685198545456, "rewards/margins": 0.10959336161613464, "rewards/rejected": -0.03092484548687935, "step": 109 }, { "epoch": 0.16628873771730915, "grad_norm": 16.8065242767334, "kl/avg_steps": 0.15625, "kl/beta": 0.08231092989444733, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.938574467213517e-07, "logits/chosen": -1.206465244293213, "logits/rejected": -1.1248822212219238, "logps/chosen": -96.55227661132812, "logps/ref_chosen": -96.9527816772461, "logps/ref_rejected": -91.44450378417969, "logps/rejected": -91.25479125976562, "loss": 1.3809, "rewards/accuracies": 0.578125, "rewards/chosen": 0.031805217266082764, "rewards/margins": 0.015741102397441864, "rewards/rejected": 0.0160641111433506, "step": 110 }, { "epoch": 0.16780045351473924, "grad_norm": 13.76848030090332, "kl/avg_steps": 0.125, "kl/beta": 0.08218251913785934, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.935627386698418e-07, "logits/chosen": -1.2548036575317383, "logits/rejected": -1.29953134059906, "logps/chosen": -69.22273254394531, "logps/ref_chosen": -70.01641845703125, "logps/ref_rejected": -92.87696838378906, "logps/rejected": -93.05228424072266, "loss": 1.3207, "rewards/accuracies": 0.578125, "rewards/chosen": 0.06431609392166138, "rewards/margins": 0.07787607610225677, "rewards/rejected": -0.013559989631175995, "step": 111 }, { "epoch": 0.1693121693121693, "grad_norm": 16.01951026916504, "kl/avg_steps": 0.46875, "kl/beta": 0.0820799171924591, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.932612176449559e-07, "logits/chosen": -1.020503044128418, "logits/rejected": -0.8089717626571655, "logps/chosen": -76.8432846069336, "logps/ref_chosen": -77.80027770996094, "logps/ref_rejected": -123.10624694824219, "logps/rejected": -123.45081329345703, "loss": 1.2966, "rewards/accuracies": 0.703125, "rewards/chosen": 0.07723334431648254, "rewards/margins": 0.10491590946912766, "rewards/rejected": -0.02768256887793541, "step": 112 }, { "epoch": 0.1708238851095994, "grad_norm": 13.632536888122559, "kl/avg_steps": 0.1875, "kl/beta": 0.08169696480035782, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.929528920808854e-07, "logits/chosen": -1.1828765869140625, "logits/rejected": -0.7840421199798584, "logps/chosen": -69.23200225830078, "logps/ref_chosen": -70.54346466064453, "logps/ref_rejected": -88.79286193847656, "logps/rejected": -88.31471252441406, "loss": 1.3326, "rewards/accuracies": 0.609375, "rewards/chosen": 0.10599475353956223, "rewards/margins": 0.06640227884054184, "rewards/rejected": 0.039592474699020386, "step": 113 }, { "epoch": 0.17233560090702948, "grad_norm": 15.452795028686523, "kl/avg_steps": 0.3125, "kl/beta": 0.08154407143592834, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.92637770602159e-07, "logits/chosen": -0.8517060875892639, "logits/rejected": -1.1199955940246582, "logps/chosen": -82.729736328125, "logps/ref_chosen": -83.9239501953125, "logps/ref_rejected": -92.85765075683594, "logps/rejected": -93.04132843017578, "loss": 1.296, "rewards/accuracies": 0.65625, "rewards/chosen": 0.09607648849487305, "rewards/margins": 0.11020876467227936, "rewards/rejected": -0.014132272452116013, "step": 114 }, { "epoch": 0.17384731670445955, "grad_norm": 13.919018745422363, "kl/avg_steps": 0.25, "kl/beta": 0.08129003643989563, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.923158620234019e-07, "logits/chosen": -1.2890362739562988, "logits/rejected": -1.096760869026184, "logps/chosen": -68.5827407836914, "logps/ref_chosen": -69.82767486572266, "logps/ref_rejected": -96.51564025878906, "logps/rejected": -96.44905090332031, "loss": 1.3068, "rewards/accuracies": 0.625, "rewards/chosen": 0.10026754438877106, "rewards/margins": 0.09398935735225677, "rewards/rejected": 0.006278195418417454, "step": 115 }, { "epoch": 0.17535903250188964, "grad_norm": 15.033320426940918, "kl/avg_steps": 0.46875, "kl/beta": 0.08108732104301453, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.91987175349089e-07, "logits/chosen": -1.1350982189178467, "logits/rejected": -0.7556982636451721, "logps/chosen": -64.77731323242188, "logps/ref_chosen": -66.19773864746094, "logps/ref_rejected": -90.88304138183594, "logps/rejected": -91.1404800415039, "loss": 1.2686, "rewards/accuracies": 0.75, "rewards/chosen": 0.11404135823249817, "rewards/margins": 0.13410945236682892, "rewards/rejected": -0.020068105310201645, "step": 116 }, { "epoch": 0.17687074829931973, "grad_norm": 13.566264152526855, "kl/avg_steps": 0.46875, "kl/beta": 0.08070899546146393, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.916517197732933e-07, "logits/chosen": -1.1341867446899414, "logits/rejected": -1.0868911743164062, "logps/chosen": -70.42806243896484, "logps/ref_chosen": -72.15988159179688, "logps/ref_rejected": -85.30296325683594, "logps/rejected": -85.21932220458984, "loss": 1.2738, "rewards/accuracies": 0.734375, "rewards/chosen": 0.13824619352817535, "rewards/margins": 0.13081501424312592, "rewards/rejected": 0.007431183010339737, "step": 117 }, { "epoch": 0.17838246409674982, "grad_norm": 13.367738723754883, "kl/avg_steps": 0.34375, "kl/beta": 0.0803324356675148, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.913095046794281e-07, "logits/chosen": -0.7425364851951599, "logits/rejected": -0.7578328847885132, "logps/chosen": -69.99385833740234, "logps/ref_chosen": -71.47773742675781, "logps/ref_rejected": -96.95051574707031, "logps/rejected": -96.75798034667969, "loss": 1.3011, "rewards/accuracies": 0.65625, "rewards/chosen": 0.11790382117033005, "rewards/margins": 0.10169126838445663, "rewards/rejected": 0.016212543472647667, "step": 118 }, { "epoch": 0.17989417989417988, "grad_norm": 13.800806045532227, "kl/avg_steps": 0.125, "kl/beta": 0.08005724102258682, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.909605396399855e-07, "logits/chosen": -1.1756548881530762, "logits/rejected": -1.0593156814575195, "logps/chosen": -76.794677734375, "logps/ref_chosen": -78.2727279663086, "logps/ref_rejected": -94.71317291259766, "logps/rejected": -94.62212371826172, "loss": 1.3024, "rewards/accuracies": 0.578125, "rewards/chosen": 0.11696292459964752, "rewards/margins": 0.10860587656497955, "rewards/rejected": 0.008357064798474312, "step": 119 }, { "epoch": 0.18140589569160998, "grad_norm": 14.784750938415527, "kl/avg_steps": 0.40625, "kl/beta": 0.07995729893445969, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.906048344162676e-07, "logits/chosen": -0.9472384452819824, "logits/rejected": -0.9431383013725281, "logps/chosen": -76.48007202148438, "logps/ref_chosen": -78.43108367919922, "logps/ref_rejected": -100.2771987915039, "logps/rejected": -100.35806274414062, "loss": 1.2469, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1544308066368103, "rewards/margins": 0.1602558195590973, "rewards/rejected": -0.005825025029480457, "step": 120 }, { "epoch": 0.18291761148904007, "grad_norm": 15.232147216796875, "kl/avg_steps": 0.3125, "kl/beta": 0.07963377982378006, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.902423989581143e-07, "logits/chosen": -1.2334175109863281, "logits/rejected": -0.9664556384086609, "logps/chosen": -72.31959533691406, "logps/ref_chosen": -74.08768463134766, "logps/ref_rejected": -118.6731948852539, "logps/rejected": -118.37752532958984, "loss": 1.2912, "rewards/accuracies": 0.671875, "rewards/chosen": 0.13956449925899506, "rewards/margins": 0.11507533490657806, "rewards/rejected": 0.0244891494512558, "step": 121 }, { "epoch": 0.18442932728647016, "grad_norm": 14.288521766662598, "kl/avg_steps": 0.34375, "kl/beta": 0.07938570529222488, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.898732434036243e-07, "logits/chosen": -1.187713384628296, "logits/rejected": -1.2080814838409424, "logps/chosen": -77.47132873535156, "logps/ref_chosen": -79.36761474609375, "logps/ref_rejected": -92.42371368408203, "logps/rejected": -92.08001708984375, "loss": 1.2906, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1492442637681961, "rewards/margins": 0.1209423691034317, "rewards/rejected": 0.028301900252699852, "step": 122 }, { "epoch": 0.18594104308390022, "grad_norm": 13.947713851928711, "kl/avg_steps": 0.46875, "kl/beta": 0.07911375164985657, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.894973780788722e-07, "logits/chosen": -1.215649127960205, "logits/rejected": -1.056673526763916, "logps/chosen": -69.75860595703125, "logps/ref_chosen": -71.91705322265625, "logps/ref_rejected": -96.36418151855469, "logps/rejected": -95.97323608398438, "loss": 1.2698, "rewards/accuracies": 0.734375, "rewards/chosen": 0.16937178373336792, "rewards/margins": 0.13755583763122559, "rewards/rejected": 0.031815946102142334, "step": 123 }, { "epoch": 0.1874527588813303, "grad_norm": 14.389055252075195, "kl/avg_steps": 0.4375, "kl/beta": 0.07874463498592377, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.89114813497619e-07, "logits/chosen": -1.3429356813430786, "logits/rejected": -0.8390483856201172, "logps/chosen": -69.33705139160156, "logps/ref_chosen": -71.72529602050781, "logps/ref_rejected": -111.17984771728516, "logps/rejected": -110.77123260498047, "loss": 1.2621, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1867365539073944, "rewards/margins": 0.15325342118740082, "rewards/rejected": 0.0334831178188324, "step": 124 }, { "epoch": 0.1889644746787604, "grad_norm": 15.154690742492676, "kl/avg_steps": 0.53125, "kl/beta": 0.07840162515640259, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.887255603610184e-07, "logits/chosen": -1.1709169149398804, "logits/rejected": -1.1800987720489502, "logps/chosen": -78.87779235839844, "logps/ref_chosen": -81.55532836914062, "logps/ref_rejected": -110.9144287109375, "logps/rejected": -110.4371566772461, "loss": 1.2443, "rewards/accuracies": 0.75, "rewards/chosen": 0.20813393592834473, "rewards/margins": 0.16979669034481049, "rewards/rejected": 0.03833724558353424, "step": 125 }, { "epoch": 0.19047619047619047, "grad_norm": 14.45041275024414, "kl/avg_steps": 0.21875, "kl/beta": 0.07798732072114944, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.883296295573176e-07, "logits/chosen": -1.2791764736175537, "logits/rejected": -0.9790908098220825, "logps/chosen": -83.41371154785156, "logps/ref_chosen": -87.07349395751953, "logps/ref_rejected": -85.05271911621094, "logps/rejected": -83.06053161621094, "loss": 1.2928, "rewards/accuracies": 0.59375, "rewards/chosen": 0.28314852714538574, "rewards/margins": 0.12732331454753876, "rewards/rejected": 0.15582521259784698, "step": 126 }, { "epoch": 0.19198790627362056, "grad_norm": 13.543227195739746, "kl/avg_steps": 0.40625, "kl/beta": 0.07781709730625153, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.87927032161552e-07, "logits/chosen": -1.2604224681854248, "logits/rejected": -1.135559320449829, "logps/chosen": -76.91162109375, "logps/ref_chosen": -80.4578857421875, "logps/ref_rejected": -90.50740051269531, "logps/rejected": -89.30085754394531, "loss": 1.2325, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2738530933856964, "rewards/margins": 0.17955777049064636, "rewards/rejected": 0.09429533034563065, "step": 127 }, { "epoch": 0.19349962207105065, "grad_norm": 15.481466293334961, "kl/avg_steps": 0.34375, "kl/beta": 0.07750224322080612, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.875177794352363e-07, "logits/chosen": -1.0098241567611694, "logits/rejected": -1.1978508234024048, "logps/chosen": -82.52719116210938, "logps/ref_chosen": -85.77519226074219, "logps/ref_rejected": -112.63516235351562, "logps/rejected": -110.98802185058594, "loss": 1.307, "rewards/accuracies": 0.609375, "rewards/chosen": 0.24998116493225098, "rewards/margins": 0.12104400992393494, "rewards/rejected": 0.12893712520599365, "step": 128 }, { "epoch": 0.19501133786848074, "grad_norm": 14.332625389099121, "kl/avg_steps": 0.1875, "kl/beta": 0.07723674178123474, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.871018828260491e-07, "logits/chosen": -1.243614673614502, "logits/rejected": -1.0912466049194336, "logps/chosen": -81.95756530761719, "logps/ref_chosen": -84.94615173339844, "logps/ref_rejected": -85.36473846435547, "logps/rejected": -84.05223083496094, "loss": 1.2993, "rewards/accuracies": 0.609375, "rewards/chosen": 0.2291855812072754, "rewards/margins": 0.12623238563537598, "rewards/rejected": 0.10295319557189941, "step": 129 }, { "epoch": 0.1965230536659108, "grad_norm": 14.440750122070312, "kl/avg_steps": 0.28125, "kl/beta": 0.07709219306707382, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.866793539675126e-07, "logits/chosen": -1.017459750175476, "logits/rejected": -0.9253628253936768, "logps/chosen": -75.57014465332031, "logps/ref_chosen": -79.01844787597656, "logps/ref_rejected": -97.63998413085938, "logps/rejected": -96.27738952636719, "loss": 1.2618, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2643080949783325, "rewards/margins": 0.15799805521965027, "rewards/rejected": 0.10631005465984344, "step": 130 }, { "epoch": 0.1980347694633409, "grad_norm": 13.493081092834473, "kl/avg_steps": 0.375, "kl/beta": 0.07687597721815109, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.86250204678667e-07, "logits/chosen": -1.1057569980621338, "logits/rejected": -1.128687858581543, "logps/chosen": -64.50733184814453, "logps/ref_chosen": -68.24565887451172, "logps/ref_rejected": -97.99555969238281, "logps/rejected": -96.99533081054688, "loss": 1.2242, "rewards/accuracies": 0.703125, "rewards/chosen": 0.28541046380996704, "rewards/margins": 0.2072276771068573, "rewards/rejected": 0.07818278670310974, "step": 131 }, { "epoch": 0.19954648526077098, "grad_norm": 13.580437660217285, "kl/avg_steps": 0.4375, "kl/beta": 0.07658877223730087, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.858144469637408e-07, "logits/chosen": -1.3431222438812256, "logits/rejected": -1.4877030849456787, "logps/chosen": -78.46906280517578, "logps/ref_chosen": -82.0653305053711, "logps/ref_rejected": -89.47691345214844, "logps/rejected": -87.79734802246094, "loss": 1.2726, "rewards/accuracies": 0.71875, "rewards/chosen": 0.27282848954200745, "rewards/margins": 0.14421497285366058, "rewards/rejected": 0.12861351668834686, "step": 132 }, { "epoch": 0.20105820105820105, "grad_norm": 13.570717811584473, "kl/avg_steps": 0.3125, "kl/beta": 0.07625515758991241, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.853720930118138e-07, "logits/chosen": -1.475085735321045, "logits/rejected": -1.0344434976577759, "logps/chosen": -79.60618591308594, "logps/ref_chosen": -83.70661163330078, "logps/ref_rejected": -89.3868179321289, "logps/rejected": -87.3953628540039, "loss": 1.2664, "rewards/accuracies": 0.625, "rewards/chosen": 0.3101305663585663, "rewards/margins": 0.15772530436515808, "rewards/rejected": 0.1524052619934082, "step": 133 }, { "epoch": 0.20256991685563114, "grad_norm": 12.60647964477539, "kl/avg_steps": 0.5, "kl/beta": 0.0760175958275795, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.849231551964771e-07, "logits/chosen": -1.4141769409179688, "logits/rejected": -1.1397137641906738, "logps/chosen": -66.85868835449219, "logps/ref_chosen": -71.57601165771484, "logps/ref_rejected": -92.34259033203125, "logps/rejected": -90.73544311523438, "loss": 1.2025, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3563517928123474, "rewards/margins": 0.23303887248039246, "rewards/rejected": 0.12331293523311615, "step": 134 }, { "epoch": 0.20408163265306123, "grad_norm": 12.527703285217285, "kl/avg_steps": 0.375, "kl/beta": 0.07563940435647964, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.844676460754862e-07, "logits/chosen": -1.4790236949920654, "logits/rejected": -1.222767949104309, "logps/chosen": -61.59791946411133, "logps/ref_chosen": -66.39884948730469, "logps/ref_rejected": -81.38636016845703, "logps/rejected": -78.73994445800781, "loss": 1.2616, "rewards/accuracies": 0.6875, "rewards/chosen": 0.36091580986976624, "rewards/margins": 0.16017356514930725, "rewards/rejected": 0.2007422298192978, "step": 135 }, { "epoch": 0.20559334845049132, "grad_norm": 13.989602088928223, "kl/avg_steps": 0.1875, "kl/beta": 0.07535681128501892, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.840055783904106e-07, "logits/chosen": -1.0112760066986084, "logits/rejected": -1.174780011177063, "logps/chosen": -83.18777465820312, "logps/ref_chosen": -86.75381469726562, "logps/ref_rejected": -113.35548400878906, "logps/rejected": -112.59575653076172, "loss": 1.2478, "rewards/accuracies": 0.640625, "rewards/chosen": 0.267022967338562, "rewards/margins": 0.20755240321159363, "rewards/rejected": 0.05947057530283928, "step": 136 }, { "epoch": 0.20710506424792138, "grad_norm": 12.38171100616455, "kl/avg_steps": 0.328125, "kl/beta": 0.07521578669548035, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.835369650662767e-07, "logits/chosen": -1.440555453300476, "logits/rejected": -1.2636154890060425, "logps/chosen": -67.68683624267578, "logps/ref_chosen": -72.21119689941406, "logps/ref_rejected": -88.30802917480469, "logps/rejected": -86.6800765991211, "loss": 1.2184, "rewards/accuracies": 0.671875, "rewards/chosen": 0.3381859362125397, "rewards/margins": 0.2144036889076233, "rewards/rejected": 0.12378223240375519, "step": 137 }, { "epoch": 0.20861678004535147, "grad_norm": 12.980399131774902, "kl/avg_steps": 0.34375, "kl/beta": 0.07496979087591171, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.830618192112065e-07, "logits/chosen": -1.4844608306884766, "logits/rejected": -1.660388469696045, "logps/chosen": -71.28809356689453, "logps/ref_chosen": -74.54273223876953, "logps/ref_rejected": -84.63614654541016, "logps/rejected": -83.41443634033203, "loss": 1.2868, "rewards/accuracies": 0.640625, "rewards/chosen": 0.24151670932769775, "rewards/margins": 0.1492181122303009, "rewards/rejected": 0.09229859709739685, "step": 138 }, { "epoch": 0.21012849584278157, "grad_norm": 15.18370246887207, "kl/avg_steps": 0.375, "kl/beta": 0.07471296191215515, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.825801541160509e-07, "logits/chosen": -1.3889563083648682, "logits/rejected": -1.331032156944275, "logps/chosen": -85.1347427368164, "logps/ref_chosen": -87.63740539550781, "logps/ref_rejected": -101.3896484375, "logps/rejected": -101.09443664550781, "loss": 1.2746, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18462873995304108, "rewards/margins": 0.16127237677574158, "rewards/rejected": 0.023356378078460693, "step": 139 }, { "epoch": 0.21164021164021163, "grad_norm": 15.858207702636719, "kl/avg_steps": 0.28125, "kl/beta": 0.0744338408112526, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.820919832540181e-07, "logits/chosen": -1.2838711738586426, "logits/rejected": -1.091439127922058, "logps/chosen": -77.99700927734375, "logps/ref_chosen": -81.32339477539062, "logps/ref_rejected": -99.72755432128906, "logps/rejected": -99.94612121582031, "loss": 1.1982, "rewards/accuracies": 0.640625, "rewards/chosen": 0.24433737993240356, "rewards/margins": 0.25949618220329285, "rewards/rejected": -0.015158784575760365, "step": 140 }, { "epoch": 0.21315192743764172, "grad_norm": 12.245986938476562, "kl/avg_steps": 0.375, "kl/beta": 0.07422508299350739, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.815973202802966e-07, "logits/chosen": -1.2865171432495117, "logits/rejected": -1.0338687896728516, "logps/chosen": -75.33793640136719, "logps/ref_chosen": -78.08534240722656, "logps/ref_rejected": -101.70516967773438, "logps/rejected": -102.56867218017578, "loss": 1.195, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20178569853305817, "rewards/margins": 0.2635989189147949, "rewards/rejected": -0.06181325763463974, "step": 141 }, { "epoch": 0.2146636432350718, "grad_norm": 12.931549072265625, "kl/avg_steps": 0.34375, "kl/beta": 0.07394777238368988, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.810961790316729e-07, "logits/chosen": -1.2304003238677979, "logits/rejected": -1.1563172340393066, "logps/chosen": -80.3508529663086, "logps/ref_chosen": -82.84616088867188, "logps/ref_rejected": -95.14713287353516, "logps/rejected": -95.14163970947266, "loss": 1.2561, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1831914186477661, "rewards/margins": 0.1807531714439392, "rewards/rejected": 0.0024382397532463074, "step": 142 }, { "epoch": 0.2161753590325019, "grad_norm": 15.332486152648926, "kl/avg_steps": 0.171875, "kl/beta": 0.07369445264339447, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.805885735261454e-07, "logits/chosen": -1.1605253219604492, "logits/rejected": -1.1731536388397217, "logps/chosen": -78.10973358154297, "logps/ref_chosen": -80.29791259765625, "logps/ref_rejected": -87.44291687011719, "logps/rejected": -87.5304946899414, "loss": 1.2913, "rewards/accuracies": 0.59375, "rewards/chosen": 0.15912334620952606, "rewards/margins": 0.16363541781902313, "rewards/rejected": -0.004512062296271324, "step": 143 }, { "epoch": 0.21768707482993196, "grad_norm": 18.32091522216797, "kl/avg_steps": 0.03125, "kl/beta": 0.07356800884008408, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 4.800745179625307e-07, "logits/chosen": -1.2224823236465454, "logits/rejected": -0.8715156316757202, "logps/chosen": -77.77732849121094, "logps/ref_chosen": -79.09428405761719, "logps/ref_rejected": -92.42912292480469, "logps/rejected": -93.04374694824219, "loss": 1.3382, "rewards/accuracies": 0.5, "rewards/chosen": 0.09343221783638, "rewards/margins": 0.1373026967048645, "rewards/rejected": -0.04387049004435539, "step": 144 }, { "epoch": 0.21919879062736206, "grad_norm": 15.246169090270996, "kl/avg_steps": 0.46875, "kl/beta": 0.07354502379894257, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.795540267200686e-07, "logits/chosen": -1.3926103115081787, "logits/rejected": -1.1138077974319458, "logps/chosen": -95.6426773071289, "logps/ref_chosen": -97.7087173461914, "logps/ref_rejected": -97.63011169433594, "logps/rejected": -98.70494079589844, "loss": 1.2545, "rewards/accuracies": 0.734375, "rewards/chosen": 0.14838823676109314, "rewards/margins": 0.22605197131633759, "rewards/rejected": -0.07766373455524445, "step": 145 }, { "epoch": 0.22071050642479215, "grad_norm": 13.784503936767578, "kl/avg_steps": 0.28125, "kl/beta": 0.07320188730955124, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.790271143580173e-07, "logits/chosen": -1.4019311666488647, "logits/rejected": -1.174739122390747, "logps/chosen": -74.28959655761719, "logps/ref_chosen": -76.56293487548828, "logps/ref_rejected": -83.78160095214844, "logps/rejected": -83.92266845703125, "loss": 1.2839, "rewards/accuracies": 0.640625, "rewards/chosen": 0.16467270255088806, "rewards/margins": 0.1724810153245926, "rewards/rejected": -0.007808296009898186, "step": 146 }, { "epoch": 0.2222222222222222, "grad_norm": 15.15932559967041, "kl/avg_steps": 0.125, "kl/beta": 0.072996586561203, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.784937956152489e-07, "logits/chosen": -1.3146681785583496, "logits/rejected": -1.2722773551940918, "logps/chosen": -82.0851821899414, "logps/ref_chosen": -83.24113464355469, "logps/ref_rejected": -97.50960540771484, "logps/rejected": -98.47145080566406, "loss": 1.3017, "rewards/accuracies": 0.609375, "rewards/chosen": 0.08227235823869705, "rewards/margins": 0.15057937800884247, "rewards/rejected": -0.06830701977014542, "step": 147 }, { "epoch": 0.2237339380196523, "grad_norm": 12.173563957214355, "kl/avg_steps": 0.4375, "kl/beta": 0.07290545105934143, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.779540854098347e-07, "logits/chosen": -1.4075052738189697, "logits/rejected": -1.5484192371368408, "logps/chosen": -64.48944091796875, "logps/ref_chosen": -66.36277770996094, "logps/ref_rejected": -87.66487121582031, "logps/rejected": -89.90396118164062, "loss": 1.1962, "rewards/accuracies": 0.71875, "rewards/chosen": 0.13385669887065887, "rewards/margins": 0.2946438789367676, "rewards/rejected": -0.1607871800661087, "step": 148 }, { "epoch": 0.2252456538170824, "grad_norm": 13.464912414550781, "kl/avg_steps": 0.28125, "kl/beta": 0.07258788496255875, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.774079988386296e-07, "logits/chosen": -1.3259451389312744, "logits/rejected": -1.4282548427581787, "logps/chosen": -72.2156982421875, "logps/ref_chosen": -72.0576171875, "logps/ref_rejected": -83.94097900390625, "logps/rejected": -86.57736206054688, "loss": 1.273, "rewards/accuracies": 0.640625, "rewards/chosen": -0.013719309121370316, "rewards/margins": 0.17610222101211548, "rewards/rejected": -0.1898215413093567, "step": 149 }, { "epoch": 0.22675736961451248, "grad_norm": 12.965806007385254, "kl/avg_steps": 0.34375, "kl/beta": 0.0723842978477478, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.768555511768486e-07, "logits/chosen": -1.2750177383422852, "logits/rejected": -0.8741225004196167, "logps/chosen": -85.07838439941406, "logps/ref_chosen": -85.52684783935547, "logps/ref_rejected": -108.37450408935547, "logps/rejected": -112.33922576904297, "loss": 1.1922, "rewards/accuracies": 0.703125, "rewards/chosen": 0.02943560853600502, "rewards/margins": 0.3137895464897156, "rewards/rejected": -0.28435391187667847, "step": 150 }, { "epoch": 0.22826908541194255, "grad_norm": 12.630627632141113, "kl/avg_steps": 0.34375, "kl/beta": 0.07213633507490158, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.762967578776406e-07, "logits/chosen": -1.2928177118301392, "logits/rejected": -1.1232836246490479, "logps/chosen": -66.61726379394531, "logps/ref_chosen": -69.160888671875, "logps/ref_rejected": -91.42207336425781, "logps/rejected": -93.68106079101562, "loss": 1.1445, "rewards/accuracies": 0.671875, "rewards/chosen": 0.17991137504577637, "rewards/margins": 0.34157001972198486, "rewards/rejected": -0.1616586446762085, "step": 151 }, { "epoch": 0.22978080120937264, "grad_norm": 13.797011375427246, "kl/avg_steps": 0.375, "kl/beta": 0.07188921421766281, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.757316345716553e-07, "logits/chosen": -1.2502892017364502, "logits/rejected": -1.2125251293182373, "logps/chosen": -71.50172424316406, "logps/ref_chosen": -72.48135375976562, "logps/ref_rejected": -94.44818878173828, "logps/rejected": -97.02812957763672, "loss": 1.232, "rewards/accuracies": 0.703125, "rewards/chosen": 0.06772876530885696, "rewards/margins": 0.2509690523147583, "rewards/rejected": -0.18324029445648193, "step": 152 }, { "epoch": 0.23129251700680273, "grad_norm": 12.633346557617188, "kl/avg_steps": 0.28125, "kl/beta": 0.0716206356883049, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.751601970666064e-07, "logits/chosen": -1.2384766340255737, "logits/rejected": -1.0395536422729492, "logps/chosen": -89.7574691772461, "logps/ref_chosen": -89.6655044555664, "logps/ref_rejected": -90.67737579345703, "logps/rejected": -94.66575622558594, "loss": 1.1962, "rewards/accuracies": 0.6875, "rewards/chosen": -0.008383519947528839, "rewards/margins": 0.2745283842086792, "rewards/rejected": -0.28291189670562744, "step": 153 }, { "epoch": 0.2328042328042328, "grad_norm": 14.03810977935791, "kl/avg_steps": 0.328125, "kl/beta": 0.07141976803541183, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.745824613468292e-07, "logits/chosen": -1.3895330429077148, "logits/rejected": -1.4276092052459717, "logps/chosen": -75.47744750976562, "logps/ref_chosen": -76.58096313476562, "logps/ref_rejected": -78.18670654296875, "logps/rejected": -79.85231018066406, "loss": 1.2873, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07643654942512512, "rewards/margins": 0.19298705458641052, "rewards/rejected": -0.1165504902601242, "step": 154 }, { "epoch": 0.23431594860166288, "grad_norm": 18.652502059936523, "kl/avg_steps": 0.28125, "kl/beta": 0.07118618488311768, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.7399844357283393e-07, "logits/chosen": -1.4403748512268066, "logits/rejected": -1.173947811126709, "logps/chosen": -83.46440124511719, "logps/ref_chosen": -82.65617370605469, "logps/ref_rejected": -95.52484130859375, "logps/rejected": -100.07963562011719, "loss": 1.2294, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0596730001270771, "rewards/margins": 0.2618723511695862, "rewards/rejected": -0.3215453624725342, "step": 155 }, { "epoch": 0.23582766439909297, "grad_norm": 14.09681510925293, "kl/avg_steps": 0.375, "kl/beta": 0.0709865391254425, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.7340816008085305e-07, "logits/chosen": -1.2728259563446045, "logits/rejected": -1.0183078050613403, "logps/chosen": -87.7572250366211, "logps/ref_chosen": -87.66494750976562, "logps/ref_rejected": -108.24376678466797, "logps/rejected": -113.68663787841797, "loss": 1.12, "rewards/accuracies": 0.734375, "rewards/chosen": -0.009032588452100754, "rewards/margins": 0.37457698583602905, "rewards/rejected": -0.3836095631122589, "step": 156 }, { "epoch": 0.23733938019652306, "grad_norm": 12.62127685546875, "kl/avg_steps": 0.15625, "kl/beta": 0.070721335709095, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.728116273823847e-07, "logits/chosen": -1.6038012504577637, "logits/rejected": -1.3034052848815918, "logps/chosen": -69.88165283203125, "logps/ref_chosen": -70.77095794677734, "logps/ref_rejected": -78.78271484375, "logps/rejected": -81.07191467285156, "loss": 1.2581, "rewards/accuracies": 0.625, "rewards/chosen": 0.060784123837947845, "rewards/margins": 0.2202034294605255, "rewards/rejected": -0.15941932797431946, "step": 157 }, { "epoch": 0.23885109599395313, "grad_norm": 14.012279510498047, "kl/avg_steps": 0.28125, "kl/beta": 0.07061100751161575, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -1.6464459896087646, "logits/rejected": -1.4208977222442627, "logps/chosen": -80.11083984375, "logps/ref_chosen": -81.21516418457031, "logps/ref_rejected": -97.8381118774414, "logps/rejected": -100.73680877685547, "loss": 1.229, "rewards/accuracies": 0.640625, "rewards/chosen": 0.07514195144176483, "rewards/margins": 0.277182936668396, "rewards/rejected": -0.20204098522663116, "step": 158 }, { "epoch": 0.24036281179138322, "grad_norm": 12.303658485412598, "kl/avg_steps": 0.5, "kl/beta": 0.07041297107934952, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.715998812855304e-07, "logits/chosen": -1.1418673992156982, "logits/rejected": -1.0512657165527344, "logps/chosen": -71.02163696289062, "logps/ref_chosen": -72.33412170410156, "logps/ref_rejected": -89.49591064453125, "logps/rejected": -92.77830505371094, "loss": 1.179, "rewards/accuracies": 0.75, "rewards/chosen": 0.08971140533685684, "rewards/margins": 0.31856340169906616, "rewards/rejected": -0.22885200381278992, "step": 159 }, { "epoch": 0.2418745275888133, "grad_norm": 11.714944839477539, "kl/avg_steps": 0.375, "kl/beta": 0.07006265223026276, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.7098470178228755e-07, "logits/chosen": -1.4561386108398438, "logits/rejected": -1.5497028827667236, "logps/chosen": -62.24732971191406, "logps/ref_chosen": -63.26386260986328, "logps/ref_rejected": -82.2786636352539, "logps/rejected": -85.34880828857422, "loss": 1.1985, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0691169872879982, "rewards/margins": 0.28153932094573975, "rewards/rejected": -0.21242231130599976, "step": 160 }, { "epoch": 0.24338624338624337, "grad_norm": 11.859792709350586, "kl/avg_steps": 0.40625, "kl/beta": 0.06980089843273163, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.703633408618955e-07, "logits/chosen": -1.2435733079910278, "logits/rejected": -1.202815294265747, "logps/chosen": -69.17000579833984, "logps/ref_chosen": -70.69304656982422, "logps/ref_rejected": -82.73606872558594, "logps/rejected": -85.28659057617188, "loss": 1.2059, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10437165200710297, "rewards/margins": 0.2798129916191101, "rewards/rejected": -0.17544135451316833, "step": 161 }, { "epoch": 0.24489795918367346, "grad_norm": 13.439538955688477, "kl/avg_steps": 0.546875, "kl/beta": 0.06951848417520523, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.697358159051549e-07, "logits/chosen": -1.508550763130188, "logits/rejected": -1.2809114456176758, "logps/chosen": -88.38290405273438, "logps/ref_chosen": -89.3046646118164, "logps/ref_rejected": -114.05778503417969, "logps/rejected": -119.21257781982422, "loss": 1.087, "rewards/accuracies": 0.796875, "rewards/chosen": 0.0630064308643341, "rewards/margins": 0.4171755909919739, "rewards/rejected": -0.35416918992996216, "step": 162 }, { "epoch": 0.24640967498110355, "grad_norm": 12.317628860473633, "kl/avg_steps": 0.34375, "kl/beta": 0.06914036720991135, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.691021444652876e-07, "logits/chosen": -1.2010798454284668, "logits/rejected": -0.9982933402061462, "logps/chosen": -65.79276275634766, "logps/ref_chosen": -68.61221313476562, "logps/ref_rejected": -89.03155517578125, "logps/rejected": -91.13777160644531, "loss": 1.1662, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19236725568771362, "rewards/margins": 0.33524468541145325, "rewards/rejected": -0.14287742972373962, "step": 163 }, { "epoch": 0.24792139077853365, "grad_norm": 13.418086051940918, "kl/avg_steps": 0.40625, "kl/beta": 0.06890351325273514, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6846234426744624e-07, "logits/chosen": -1.324411392211914, "logits/rejected": -1.2855700254440308, "logps/chosen": -70.44233703613281, "logps/ref_chosen": -73.55903625488281, "logps/ref_rejected": -94.16201782226562, "logps/rejected": -97.29086303710938, "loss": 1.1162, "rewards/accuracies": 0.734375, "rewards/chosen": 0.21184256672859192, "rewards/margins": 0.42402780055999756, "rewards/rejected": -0.21218520402908325, "step": 164 }, { "epoch": 0.2494331065759637, "grad_norm": 12.428820610046387, "kl/avg_steps": 0.40625, "kl/beta": 0.06862472742795944, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.678164332082175e-07, "logits/chosen": -1.278984785079956, "logits/rejected": -1.2863500118255615, "logps/chosen": -66.50820922851562, "logps/ref_chosen": -68.67132568359375, "logps/ref_rejected": -85.95690155029297, "logps/rejected": -89.47266387939453, "loss": 1.1273, "rewards/accuracies": 0.765625, "rewards/chosen": 0.14679834246635437, "rewards/margins": 0.38445547223091125, "rewards/rejected": -0.23765714466571808, "step": 165 }, { "epoch": 0.2509448223733938, "grad_norm": 13.696086883544922, "kl/avg_steps": 0.34375, "kl/beta": 0.06834706664085388, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.6716442935512214e-07, "logits/chosen": -1.145601511001587, "logits/rejected": -1.1215341091156006, "logps/chosen": -79.69151306152344, "logps/ref_chosen": -80.89754486083984, "logps/ref_rejected": -111.91075134277344, "logps/rejected": -115.30693817138672, "loss": 1.1943, "rewards/accuracies": 0.671875, "rewards/chosen": 0.08006976544857025, "rewards/margins": 0.3093419671058655, "rewards/rejected": -0.22927218675613403, "step": 166 }, { "epoch": 0.25245653817082386, "grad_norm": 11.959296226501465, "kl/avg_steps": 0.40625, "kl/beta": 0.06811293214559555, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -1.280385136604309, "logits/rejected": -1.2254626750946045, "logps/chosen": -73.93049621582031, "logps/ref_chosen": -76.73136138916016, "logps/ref_rejected": -92.57389068603516, "logps/rejected": -95.07466888427734, "loss": 1.1425, "rewards/accuracies": 0.734375, "rewards/chosen": 0.18968580663204193, "rewards/margins": 0.356110155582428, "rewards/rejected": -0.16642434895038605, "step": 167 }, { "epoch": 0.25396825396825395, "grad_norm": 12.643539428710938, "kl/avg_steps": 0.296875, "kl/beta": 0.06783734261989594, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.6584221638904767e-07, "logits/chosen": -1.1431384086608887, "logits/rejected": -1.2114157676696777, "logps/chosen": -81.43280029296875, "logps/ref_chosen": -82.63671112060547, "logps/ref_rejected": -96.72691345214844, "logps/rejected": -99.67520904541016, "loss": 1.1902, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08026599138975143, "rewards/margins": 0.2774825096130371, "rewards/rejected": -0.19721652567386627, "step": 168 }, { "epoch": 0.25547996976568405, "grad_norm": 13.48161792755127, "kl/avg_steps": 0.390625, "kl/beta": 0.06763654202222824, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.651720442612075e-07, "logits/chosen": -1.4378364086151123, "logits/rejected": -1.0551780462265015, "logps/chosen": -76.00579833984375, "logps/ref_chosen": -78.87673950195312, "logps/ref_rejected": -94.18919372558594, "logps/rejected": -97.01301574707031, "loss": 1.1499, "rewards/accuracies": 0.671875, "rewards/chosen": 0.19150575995445251, "rewards/margins": 0.37936002016067505, "rewards/rejected": -0.18785430490970612, "step": 169 }, { "epoch": 0.25699168556311414, "grad_norm": 15.122897148132324, "kl/avg_steps": 0.25, "kl/beta": 0.0673733651638031, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.6449585330874425e-07, "logits/chosen": -1.3555153608322144, "logits/rejected": -1.1191372871398926, "logps/chosen": -71.77020263671875, "logps/ref_chosen": -73.35820007324219, "logps/ref_rejected": -76.85078430175781, "logps/rejected": -78.78128814697266, "loss": 1.3308, "rewards/accuracies": 0.671875, "rewards/chosen": 0.10407811403274536, "rewards/margins": 0.23043477535247803, "rewards/rejected": -0.12635666131973267, "step": 170 }, { "epoch": 0.2585034013605442, "grad_norm": 12.071377754211426, "kl/avg_steps": 0.4375, "kl/beta": 0.06720535457134247, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.6381366244617224e-07, "logits/chosen": -1.0791172981262207, "logits/rejected": -1.18746817111969, "logps/chosen": -79.33969116210938, "logps/ref_chosen": -80.43226623535156, "logps/ref_rejected": -96.99999237060547, "logps/rejected": -102.03294372558594, "loss": 1.1541, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06989102065563202, "rewards/margins": 0.4047943949699402, "rewards/rejected": -0.334903359413147, "step": 171 }, { "epoch": 0.2600151171579743, "grad_norm": 11.951263427734375, "kl/avg_steps": 0.3125, "kl/beta": 0.06691261380910873, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.631254907558365e-07, "logits/chosen": -1.3960208892822266, "logits/rejected": -1.1161054372787476, "logps/chosen": -70.59374237060547, "logps/ref_chosen": -70.45406341552734, "logps/ref_rejected": -99.85604095458984, "logps/rejected": -105.46135711669922, "loss": 1.1674, "rewards/accuracies": 0.6875, "rewards/chosen": -0.012096043676137924, "rewards/margins": 0.3599253296852112, "rewards/rejected": -0.3720213770866394, "step": 172 }, { "epoch": 0.2615268329554044, "grad_norm": 12.691923141479492, "kl/avg_steps": 0.3125, "kl/beta": 0.066704161465168, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.624313574873786e-07, "logits/chosen": -1.4205788373947144, "logits/rejected": -1.4068093299865723, "logps/chosen": -72.95709228515625, "logps/ref_chosen": -72.15026092529297, "logps/ref_rejected": -94.10212707519531, "logps/rejected": -100.36970520019531, "loss": 1.2102, "rewards/accuracies": 0.6875, "rewards/chosen": -0.055580783635377884, "rewards/margins": 0.35744237899780273, "rewards/rejected": -0.4130231440067291, "step": 173 }, { "epoch": 0.26303854875283444, "grad_norm": 12.431907653808594, "kl/avg_steps": 0.53125, "kl/beta": 0.0664963573217392, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.61731282057198e-07, "logits/chosen": -1.0261675119400024, "logits/rejected": -1.1597849130630493, "logps/chosen": -76.77759552001953, "logps/ref_chosen": -75.99628448486328, "logps/ref_rejected": -106.2359619140625, "logps/rejected": -114.14846801757812, "loss": 1.1048, "rewards/accuracies": 0.734375, "rewards/chosen": -0.054633866995573044, "rewards/margins": 0.4673388600349426, "rewards/rejected": -0.5219727158546448, "step": 174 }, { "epoch": 0.26455026455026454, "grad_norm": 13.340142250061035, "kl/avg_steps": 0.375, "kl/beta": 0.06614496558904648, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.6102528404790965e-07, "logits/chosen": -1.486039400100708, "logits/rejected": -1.1863949298858643, "logps/chosen": -86.00851440429688, "logps/ref_chosen": -84.51177978515625, "logps/ref_rejected": -104.46299743652344, "logps/rejected": -111.91407775878906, "loss": 1.1833, "rewards/accuracies": 0.734375, "rewards/chosen": -0.10132718086242676, "rewards/margins": 0.38702377676963806, "rewards/rejected": -0.4883509576320648, "step": 175 }, { "epoch": 0.2660619803476946, "grad_norm": 15.169533729553223, "kl/avg_steps": 0.25, "kl/beta": 0.06589784473180771, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.603133832077953e-07, "logits/chosen": -1.232452392578125, "logits/rejected": -1.3203933238983154, "logps/chosen": -102.6416244506836, "logps/ref_chosen": -98.2034912109375, "logps/ref_rejected": -103.2023696899414, "logps/rejected": -111.3276596069336, "loss": 1.2962, "rewards/accuracies": 0.640625, "rewards/chosen": -0.294032484292984, "rewards/margins": 0.23688749969005585, "rewards/rejected": -0.5309199690818787, "step": 176 }, { "epoch": 0.2675736961451247, "grad_norm": 18.41356086730957, "kl/avg_steps": 0.5, "kl/beta": 0.06573351472616196, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.5959559945025183e-07, "logits/chosen": -1.1291613578796387, "logits/rejected": -1.0879511833190918, "logps/chosen": -77.68388366699219, "logps/ref_chosen": -78.029541015625, "logps/ref_rejected": -112.57099914550781, "logps/rejected": -122.57363891601562, "loss": 0.9551, "rewards/accuracies": 0.78125, "rewards/chosen": 0.021007169038057327, "rewards/margins": 0.6723326444625854, "rewards/rejected": -0.6513254642486572, "step": 177 }, { "epoch": 0.2690854119425548, "grad_norm": 16.913576126098633, "kl/avg_steps": 0.46875, "kl/beta": 0.06540647894144058, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.588719528532341e-07, "logits/chosen": -1.6763261556625366, "logits/rejected": -1.5302388668060303, "logps/chosen": -81.11467742919922, "logps/ref_chosen": -79.48869323730469, "logps/ref_rejected": -96.62449645996094, "logps/rejected": -104.88471984863281, "loss": 1.1116, "rewards/accuracies": 0.765625, "rewards/chosen": -0.10739612579345703, "rewards/margins": 0.4278530478477478, "rewards/rejected": -0.5352491736412048, "step": 178 }, { "epoch": 0.2705971277399849, "grad_norm": 15.363626480102539, "kl/avg_steps": 0.375, "kl/beta": 0.06510131806135178, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.581424636586928e-07, "logits/chosen": -1.5718255043029785, "logits/rejected": -1.4922645092010498, "logps/chosen": -84.71263885498047, "logps/ref_chosen": -84.5088119506836, "logps/ref_rejected": -93.07945251464844, "logps/rejected": -99.22616577148438, "loss": 1.19, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01574593409895897, "rewards/margins": 0.37997791171073914, "rewards/rejected": -0.3957238495349884, "step": 179 }, { "epoch": 0.272108843537415, "grad_norm": 13.242083549499512, "kl/avg_steps": 0.125, "kl/beta": 0.0648581013083458, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 4.5740715227200897e-07, "logits/chosen": -1.630727767944336, "logits/rejected": -1.671911597251892, "logps/chosen": -74.23190307617188, "logps/ref_chosen": -74.56455993652344, "logps/ref_rejected": -81.02265930175781, "logps/rejected": -86.11099243164062, "loss": 1.1924, "rewards/accuracies": 0.609375, "rewards/chosen": 0.019411645829677582, "rewards/margins": 0.34585025906562805, "rewards/rejected": -0.3264386057853699, "step": 180 }, { "epoch": 0.273620559334845, "grad_norm": 16.082420349121094, "kl/avg_steps": 0.28125, "kl/beta": 0.06477712839841843, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.566660392614228e-07, "logits/chosen": -1.7474174499511719, "logits/rejected": -1.6977311372756958, "logps/chosen": -76.37571716308594, "logps/ref_chosen": -78.77166748046875, "logps/ref_rejected": -98.29750061035156, "logps/rejected": -103.26253509521484, "loss": 1.0762, "rewards/accuracies": 0.734375, "rewards/chosen": 0.15446849167346954, "rewards/margins": 0.47106197476387024, "rewards/rejected": -0.3165934979915619, "step": 181 }, { "epoch": 0.2751322751322751, "grad_norm": 14.591492652893066, "kl/avg_steps": 0.5625, "kl/beta": 0.06459545344114304, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.5591914535745817e-07, "logits/chosen": -1.095639944076538, "logits/rejected": -1.1765986680984497, "logps/chosen": -75.13510131835938, "logps/ref_chosen": -75.67765045166016, "logps/ref_rejected": -107.47894287109375, "logps/rejected": -116.218017578125, "loss": 1.0182, "rewards/accuracies": 0.734375, "rewards/chosen": 0.03239423781633377, "rewards/margins": 0.5916929244995117, "rewards/rejected": -0.5592987537384033, "step": 182 }, { "epoch": 0.2766439909297052, "grad_norm": 18.293054580688477, "kl/avg_steps": 0.09375, "kl/beta": 0.06423413753509521, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.551664914523433e-07, "logits/chosen": -1.3249409198760986, "logits/rejected": -1.243220567703247, "logps/chosen": -83.7846908569336, "logps/ref_chosen": -79.99969482421875, "logps/ref_rejected": -89.35220336914062, "logps/rejected": -96.42832946777344, "loss": 1.3087, "rewards/accuracies": 0.625, "rewards/chosen": -0.24508771300315857, "rewards/margins": 0.20615342259407043, "rewards/rejected": -0.451241135597229, "step": 183 }, { "epoch": 0.2781557067271353, "grad_norm": 11.336007118225098, "kl/avg_steps": 0.40625, "kl/beta": 0.06417397409677505, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.544080985994258e-07, "logits/chosen": -1.4438279867172241, "logits/rejected": -1.3008147478103638, "logps/chosen": -60.90853500366211, "logps/ref_chosen": -62.133941650390625, "logps/ref_rejected": -84.44404602050781, "logps/rejected": -90.04169464111328, "loss": 1.0963, "rewards/accuracies": 0.75, "rewards/chosen": 0.07818011939525604, "rewards/margins": 0.43237122893333435, "rewards/rejected": -0.3541911244392395, "step": 184 }, { "epoch": 0.2796674225245654, "grad_norm": 11.554147720336914, "kl/avg_steps": 0.375, "kl/beta": 0.06391432136297226, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.5364398801258394e-07, "logits/chosen": -1.1598920822143555, "logits/rejected": -1.2415722608566284, "logps/chosen": -69.17321014404297, "logps/ref_chosen": -67.93175506591797, "logps/ref_rejected": -83.76744079589844, "logps/rejected": -92.03234100341797, "loss": 1.1778, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08316050469875336, "rewards/margins": 0.44052523374557495, "rewards/rejected": -0.5236857533454895, "step": 185 }, { "epoch": 0.2811791383219955, "grad_norm": 13.363798141479492, "kl/avg_steps": 0.234375, "kl/beta": 0.06367553770542145, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.5287418106563354e-07, "logits/chosen": -1.4864418506622314, "logits/rejected": -1.4083607196807861, "logps/chosen": -86.89867401123047, "logps/ref_chosen": -86.22174072265625, "logps/ref_rejected": -100.42019653320312, "logps/rejected": -108.28703308105469, "loss": 1.1692, "rewards/accuracies": 0.640625, "rewards/chosen": -0.045348234474658966, "rewards/margins": 0.4500497281551361, "rewards/rejected": -0.49539801478385925, "step": 186 }, { "epoch": 0.28269085411942557, "grad_norm": 14.167108535766602, "kl/avg_steps": 0.5, "kl/beta": 0.06352664530277252, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.520986992917297e-07, "logits/chosen": -1.370615005493164, "logits/rejected": -1.32820463180542, "logps/chosen": -96.74392700195312, "logps/ref_chosen": -92.81202697753906, "logps/ref_rejected": -117.28926086425781, "logps/rejected": -128.07684326171875, "loss": 1.1352, "rewards/accuracies": 0.75, "rewards/chosen": -0.2524537742137909, "rewards/margins": 0.42868465185165405, "rewards/rejected": -0.6811383962631226, "step": 187 }, { "epoch": 0.2842025699168556, "grad_norm": 13.300924301147461, "kl/avg_steps": 0.34375, "kl/beta": 0.06321059167385101, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.5131756438276466e-07, "logits/chosen": -1.242083191871643, "logits/rejected": -1.1624287366867065, "logps/chosen": -90.72341918945312, "logps/ref_chosen": -87.85247802734375, "logps/ref_rejected": -94.58252716064453, "logps/rejected": -103.83467102050781, "loss": 1.1445, "rewards/accuracies": 0.671875, "rewards/chosen": -0.18243508040905, "rewards/margins": 0.397586464881897, "rewards/rejected": -0.5800215005874634, "step": 188 }, { "epoch": 0.2857142857142857, "grad_norm": 16.380733489990234, "kl/avg_steps": 0.359375, "kl/beta": 0.06299405544996262, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.5053079818876096e-07, "logits/chosen": -0.905377984046936, "logits/rejected": -1.0313442945480347, "logps/chosen": -96.79391479492188, "logps/ref_chosen": -95.00414276123047, "logps/ref_rejected": -90.50090789794922, "logps/rejected": -97.81324768066406, "loss": 1.2432, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11515786498785019, "rewards/margins": 0.34047895669937134, "rewards/rejected": -0.4556368589401245, "step": 189 }, { "epoch": 0.2872260015117158, "grad_norm": 15.347829818725586, "kl/avg_steps": 0.625, "kl/beta": 0.06276848167181015, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.4973842271726024e-07, "logits/chosen": -1.1943118572235107, "logits/rejected": -1.4775378704071045, "logps/chosen": -71.35322570800781, "logps/ref_chosen": -70.79264831542969, "logps/ref_rejected": -122.56155395507812, "logps/rejected": -132.76144409179688, "loss": 1.004, "rewards/accuracies": 0.828125, "rewards/chosen": -0.035754382610321045, "rewards/margins": 0.5974862575531006, "rewards/rejected": -0.6332406401634216, "step": 190 }, { "epoch": 0.2887377173091459, "grad_norm": 15.84787654876709, "kl/avg_steps": 0.40625, "kl/beta": 0.06237861141562462, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.48940460132708e-07, "logits/chosen": -1.1038767099380493, "logits/rejected": -1.179697871208191, "logps/chosen": -97.53190612792969, "logps/ref_chosen": -92.15048217773438, "logps/ref_rejected": -106.4153060913086, "logps/rejected": -117.19906616210938, "loss": 1.2247, "rewards/accuracies": 0.703125, "rewards/chosen": -0.33690476417541504, "rewards/margins": 0.3303537666797638, "rewards/rejected": -0.6672585010528564, "step": 191 }, { "epoch": 0.29024943310657597, "grad_norm": 11.231852531433105, "kl/avg_steps": 0.25, "kl/beta": 0.062126222997903824, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.481369327558329e-07, "logits/chosen": -1.3626068830490112, "logits/rejected": -1.3309613466262817, "logps/chosen": -74.0158920288086, "logps/ref_chosen": -69.51527404785156, "logps/ref_rejected": -80.15898132324219, "logps/rejected": -88.65760040283203, "loss": 1.2594, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2809726595878601, "rewards/margins": 0.243299663066864, "rewards/rejected": -0.5242723226547241, "step": 192 }, { "epoch": 0.29176114890400606, "grad_norm": 11.15489673614502, "kl/avg_steps": 0.375, "kl/beta": 0.06197129562497139, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.47327863063023e-07, "logits/chosen": -1.5196974277496338, "logits/rejected": -1.1110320091247559, "logps/chosen": -75.85136413574219, "logps/ref_chosen": -73.43277740478516, "logps/ref_rejected": -77.81238555908203, "logps/rejected": -88.97630310058594, "loss": 1.0604, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15187571942806244, "rewards/margins": 0.5345999002456665, "rewards/rejected": -0.6864755749702454, "step": 193 }, { "epoch": 0.29327286470143615, "grad_norm": 14.358479499816895, "kl/avg_steps": 0.1875, "kl/beta": 0.06173977255821228, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -1.535353660583496, "logits/rejected": -1.5219361782073975, "logps/chosen": -83.10891723632812, "logps/ref_chosen": -76.63236999511719, "logps/ref_rejected": -85.67449188232422, "logps/rejected": -96.16249084472656, "loss": 1.3277, "rewards/accuracies": 0.625, "rewards/chosen": -0.4020426273345947, "rewards/margins": 0.24061188101768494, "rewards/rejected": -0.642654538154602, "step": 194 }, { "epoch": 0.2947845804988662, "grad_norm": 12.973784446716309, "kl/avg_steps": 0.4375, "kl/beta": 0.06162422522902489, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.4569318740967043e-07, "logits/chosen": -1.4127130508422852, "logits/rejected": -1.2396118640899658, "logps/chosen": -97.07945251464844, "logps/ref_chosen": -89.43354797363281, "logps/ref_rejected": -91.25908660888672, "logps/rejected": -105.1020278930664, "loss": 1.1539, "rewards/accuracies": 0.71875, "rewards/chosen": -0.47174888849258423, "rewards/margins": 0.3756607174873352, "rewards/rejected": -0.8474096059799194, "step": 195 }, { "epoch": 0.2962962962962963, "grad_norm": 12.20384693145752, "kl/avg_steps": 0.4375, "kl/beta": 0.061355795711278915, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.448676271745197e-07, "logits/chosen": -1.1774715185165405, "logits/rejected": -1.1764438152313232, "logps/chosen": -81.13077545166016, "logps/ref_chosen": -75.47528076171875, "logps/ref_rejected": -99.37582397460938, "logps/rejected": -110.72274780273438, "loss": 1.19, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34683603048324585, "rewards/margins": 0.3434637486934662, "rewards/rejected": -0.6902998089790344, "step": 196 }, { "epoch": 0.29780801209372637, "grad_norm": 16.509716033935547, "kl/avg_steps": 0.53125, "kl/beta": 0.06108853220939636, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.440366160729392e-07, "logits/chosen": -1.4987233877182007, "logits/rejected": -1.3290870189666748, "logps/chosen": -72.13752746582031, "logps/ref_chosen": -67.57392883300781, "logps/ref_rejected": -89.97993469238281, "logps/rejected": -102.06908416748047, "loss": 1.1802, "rewards/accuracies": 0.75, "rewards/chosen": -0.2799578905105591, "rewards/margins": 0.4513537287712097, "rewards/rejected": -0.7313116788864136, "step": 197 }, { "epoch": 0.29931972789115646, "grad_norm": 12.162671089172363, "kl/avg_steps": 0.5, "kl/beta": 0.06076571345329285, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.432001773500957e-07, "logits/chosen": -1.3527452945709229, "logits/rejected": -1.3725178241729736, "logps/chosen": -82.77947998046875, "logps/ref_chosen": -77.36013793945312, "logps/ref_rejected": -90.55670166015625, "logps/rejected": -103.86079406738281, "loss": 1.089, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3297055959701538, "rewards/margins": 0.47221100330352783, "rewards/rejected": -0.8019165992736816, "step": 198 }, { "epoch": 0.30083144368858655, "grad_norm": 13.684673309326172, "kl/avg_steps": 0.34375, "kl/beta": 0.060463398694992065, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.4235833440297856e-07, "logits/chosen": -1.1889785528182983, "logits/rejected": -1.427518606185913, "logps/chosen": -79.80760955810547, "logps/ref_chosen": -73.050048828125, "logps/ref_rejected": -95.21923065185547, "logps/rejected": -108.72315216064453, "loss": 1.2043, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41020143032073975, "rewards/margins": 0.40017008781433105, "rewards/rejected": -0.8103715181350708, "step": 199 }, { "epoch": 0.30234315948601664, "grad_norm": 12.483220100402832, "kl/avg_steps": 0.40625, "kl/beta": 0.060256268829107285, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.415111107797445e-07, "logits/chosen": -1.449273705482483, "logits/rejected": -1.5978740453720093, "logps/chosen": -77.9295883178711, "logps/ref_chosen": -73.75833129882812, "logps/ref_rejected": -105.00157165527344, "logps/rejected": -117.43846893310547, "loss": 1.1555, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2541738748550415, "rewards/margins": 0.4897688627243042, "rewards/rejected": -0.7439427375793457, "step": 200 }, { "epoch": 0.30234315948601664, "eval_kl/n_epsilon_steps": 0.3050176203250885, "eval_kl/p_epsilon_steps": 0.6923415660858154, "eval_logits/chosen": -1.2620441913604736, "eval_logits/rejected": -1.321791410446167, "eval_logps/chosen": -91.24237060546875, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -108.5374526977539, "eval_loss": 0.5709093809127808, "eval_rewards/accuracies": 0.7090669274330139, "eval_rewards/chosen": -0.2619553506374359, "eval_rewards/margins": 0.4430922269821167, "eval_rewards/rejected": -0.7050475478172302, "eval_runtime": 46.8793, "eval_samples_per_second": 49.126, "eval_steps_per_second": 1.536, "step": 200 }, { "epoch": 0.30385487528344673, "grad_norm": 13.348389625549316, "kl/avg_steps": 0.53125, "kl/beta": 0.060012467205524445, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.4065853017905953e-07, "logits/chosen": -1.494527816772461, "logits/rejected": -1.5776469707489014, "logps/chosen": -84.34037017822266, "logps/ref_chosen": -79.4841079711914, "logps/ref_rejected": -100.94434356689453, "logps/rejected": -114.36886596679688, "loss": 1.0716, "rewards/accuracies": 0.71875, "rewards/chosen": -0.29187309741973877, "rewards/margins": 0.50721275806427, "rewards/rejected": -0.7990858554840088, "step": 201 }, { "epoch": 0.30536659108087677, "grad_norm": 15.9328031539917, "kl/avg_steps": 0.359375, "kl/beta": 0.05969533324241638, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.3980061644943575e-07, "logits/chosen": -1.2178311347961426, "logits/rejected": -1.3383135795593262, "logps/chosen": -69.4677963256836, "logps/ref_chosen": -66.83952331542969, "logps/ref_rejected": -93.05116271972656, "logps/rejected": -103.94529724121094, "loss": 1.0943, "rewards/accuracies": 0.671875, "rewards/chosen": -0.15965795516967773, "rewards/margins": 0.48617392778396606, "rewards/rejected": -0.645831823348999, "step": 202 }, { "epoch": 0.30687830687830686, "grad_norm": 13.11849594116211, "kl/avg_steps": 0.5, "kl/beta": 0.05948157235980034, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.3893739358856455e-07, "logits/chosen": -1.241285800933838, "logits/rejected": -1.5145585536956787, "logps/chosen": -84.9322509765625, "logps/ref_chosen": -80.32998657226562, "logps/ref_rejected": -113.52803039550781, "logps/rejected": -125.73237609863281, "loss": 1.1076, "rewards/accuracies": 0.75, "rewards/chosen": -0.27520662546157837, "rewards/margins": 0.4461778998374939, "rewards/rejected": -0.7213845252990723, "step": 203 }, { "epoch": 0.30839002267573695, "grad_norm": 14.792080879211426, "kl/avg_steps": 0.4375, "kl/beta": 0.05918564647436142, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.380688857426449e-07, "logits/chosen": -1.321890115737915, "logits/rejected": -1.2647485733032227, "logps/chosen": -69.00665283203125, "logps/ref_chosen": -66.68875885009766, "logps/ref_rejected": -85.07586669921875, "logps/rejected": -95.97193145751953, "loss": 1.0778, "rewards/accuracies": 0.75, "rewards/chosen": -0.13773512840270996, "rewards/margins": 0.5010201930999756, "rewards/rejected": -0.6387553215026855, "step": 204 }, { "epoch": 0.30990173847316704, "grad_norm": 13.94343376159668, "kl/avg_steps": 0.3125, "kl/beta": 0.058927834033966064, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.3719511720570814e-07, "logits/chosen": -1.4380276203155518, "logits/rejected": -1.2264991998672485, "logps/chosen": -90.58882141113281, "logps/ref_chosen": -86.5195083618164, "logps/ref_rejected": -112.55375671386719, "logps/rejected": -123.56883239746094, "loss": 1.1952, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24183359742164612, "rewards/margins": 0.4022985100746155, "rewards/rejected": -0.6441320180892944, "step": 205 }, { "epoch": 0.31141345427059713, "grad_norm": 13.377684593200684, "kl/avg_steps": 0.28125, "kl/beta": 0.05874425917863846, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.363161124189387e-07, "logits/chosen": -1.1816599369049072, "logits/rejected": -1.1749916076660156, "logps/chosen": -92.03233337402344, "logps/ref_chosen": -88.68557739257812, "logps/ref_rejected": -97.75945281982422, "logps/rejected": -106.03539276123047, "loss": 1.2678, "rewards/accuracies": 0.640625, "rewards/chosen": -0.199264794588089, "rewards/margins": 0.2834780514240265, "rewards/rejected": -0.4827428460121155, "step": 206 }, { "epoch": 0.3129251700680272, "grad_norm": 13.501809120178223, "kl/avg_steps": 0.46875, "kl/beta": 0.05857950448989868, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.3543189596998986e-07, "logits/chosen": -1.3738679885864258, "logits/rejected": -1.7910804748535156, "logps/chosen": -90.10077667236328, "logps/ref_chosen": -85.12134552001953, "logps/ref_rejected": -103.34955596923828, "logps/rejected": -116.64409637451172, "loss": 1.0961, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2933153212070465, "rewards/margins": 0.4806947708129883, "rewards/rejected": -0.7740100622177124, "step": 207 }, { "epoch": 0.3144368858654573, "grad_norm": 13.599878311157227, "kl/avg_steps": 0.21875, "kl/beta": 0.058306194841861725, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.3454249259229664e-07, "logits/chosen": -1.3307597637176514, "logits/rejected": -1.007921576499939, "logps/chosen": -79.94390869140625, "logps/ref_chosen": -78.84121704101562, "logps/ref_rejected": -89.8250503540039, "logps/rejected": -95.58148193359375, "loss": 1.2927, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06700462847948074, "rewards/margins": 0.2648945748806, "rewards/rejected": -0.3318992257118225, "step": 208 }, { "epoch": 0.31594860166288735, "grad_norm": 14.028180122375488, "kl/avg_steps": 0.359375, "kl/beta": 0.05817892774939537, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.336479271643833e-07, "logits/chosen": -1.3499562740325928, "logits/rejected": -1.2825746536254883, "logps/chosen": -86.71353149414062, "logps/ref_chosen": -85.98588562011719, "logps/ref_rejected": -107.1638412475586, "logps/rejected": -117.89958190917969, "loss": 1.0535, "rewards/accuracies": 0.71875, "rewards/chosen": -0.045121632516384125, "rewards/margins": 0.5746475458145142, "rewards/rejected": -0.6197690963745117, "step": 209 }, { "epoch": 0.31746031746031744, "grad_norm": 14.91519546508789, "kl/avg_steps": 0.40625, "kl/beta": 0.057970594614744186, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.327482247091679e-07, "logits/chosen": -1.2512688636779785, "logits/rejected": -1.3389875888824463, "logps/chosen": -72.40476989746094, "logps/ref_chosen": -71.75653076171875, "logps/ref_rejected": -102.47966003417969, "logps/rejected": -113.26123046875, "loss": 1.0252, "rewards/accuracies": 0.734375, "rewards/chosen": -0.0397222638130188, "rewards/margins": 0.5798994302749634, "rewards/rejected": -0.6196216940879822, "step": 210 }, { "epoch": 0.31897203325774753, "grad_norm": 12.08520221710205, "kl/avg_steps": 0.34375, "kl/beta": 0.05773604288697243, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.3184341039326217e-07, "logits/chosen": -1.4723682403564453, "logits/rejected": -1.5072834491729736, "logps/chosen": -70.78600311279297, "logps/ref_chosen": -70.95170593261719, "logps/ref_rejected": -108.51902770996094, "logps/rejected": -116.66868591308594, "loss": 1.1059, "rewards/accuracies": 0.703125, "rewards/chosen": 0.008105363696813583, "rewards/margins": 0.4733632802963257, "rewards/rejected": -0.4652579426765442, "step": 211 }, { "epoch": 0.3204837490551776, "grad_norm": 15.995363235473633, "kl/avg_steps": 0.3125, "kl/beta": 0.05753825604915619, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.309335095262675e-07, "logits/chosen": -1.3575096130371094, "logits/rejected": -1.1737537384033203, "logps/chosen": -72.88034057617188, "logps/ref_chosen": -74.34010314941406, "logps/ref_rejected": -97.58259582519531, "logps/rejected": -106.42656707763672, "loss": 1.0578, "rewards/accuracies": 0.75, "rewards/chosen": 0.08113045990467072, "rewards/margins": 0.5850059390068054, "rewards/rejected": -0.5038754940032959, "step": 212 }, { "epoch": 0.3219954648526077, "grad_norm": 12.164731979370117, "kl/avg_steps": 0.25, "kl/beta": 0.057359009981155396, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.3001854756006724e-07, "logits/chosen": -1.9412446022033691, "logits/rejected": -1.4302163124084473, "logps/chosen": -78.09148406982422, "logps/ref_chosen": -80.2526626586914, "logps/ref_rejected": -94.76947021484375, "logps/rejected": -100.47601318359375, "loss": 1.1675, "rewards/accuracies": 0.671875, "rewards/chosen": 0.12260451167821884, "rewards/margins": 0.44440123438835144, "rewards/rejected": -0.321796715259552, "step": 213 }, { "epoch": 0.3235071806500378, "grad_norm": 22.89181137084961, "kl/avg_steps": 0.28125, "kl/beta": 0.05721597000956535, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.290985500881143e-07, "logits/chosen": -1.61836576461792, "logits/rejected": -1.6363917589187622, "logps/chosen": -76.35650634765625, "logps/ref_chosen": -77.9675064086914, "logps/ref_rejected": -84.0354232788086, "logps/rejected": -89.8424072265625, "loss": 1.1613, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08922252058982849, "rewards/margins": 0.41794130206108093, "rewards/rejected": -0.32871878147125244, "step": 214 }, { "epoch": 0.3250188964474679, "grad_norm": 11.631714820861816, "kl/avg_steps": 0.4375, "kl/beta": 0.05705549940466881, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.281735428447157e-07, "logits/chosen": -1.3567637205123901, "logits/rejected": -1.3765416145324707, "logps/chosen": -81.66747283935547, "logps/ref_chosen": -81.2047348022461, "logps/ref_rejected": -116.18414306640625, "logps/rejected": -127.70268249511719, "loss": 1.0129, "rewards/accuracies": 0.71875, "rewards/chosen": -0.027690857648849487, "rewards/margins": 0.6228872537612915, "rewards/rejected": -0.6505780220031738, "step": 215 }, { "epoch": 0.32653061224489793, "grad_norm": 13.688364028930664, "kl/avg_steps": 0.5, "kl/beta": 0.05680697038769722, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.2724355170431247e-07, "logits/chosen": -1.2972124814987183, "logits/rejected": -1.4084941148757935, "logps/chosen": -84.6046142578125, "logps/ref_chosen": -83.57113647460938, "logps/ref_rejected": -112.51902770996094, "logps/rejected": -122.14108276367188, "loss": 1.1093, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06136210635304451, "rewards/margins": 0.48042571544647217, "rewards/rejected": -0.54178786277771, "step": 216 }, { "epoch": 0.328042328042328, "grad_norm": 13.525312423706055, "kl/avg_steps": 0.46875, "kl/beta": 0.05652434751391411, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.26308602680756e-07, "logits/chosen": -1.4486957788467407, "logits/rejected": -1.5017526149749756, "logps/chosen": -80.17437744140625, "logps/ref_chosen": -77.01390075683594, "logps/ref_rejected": -105.28099822998047, "logps/rejected": -118.03376770019531, "loss": 1.0978, "rewards/accuracies": 0.765625, "rewards/chosen": -0.1803884506225586, "rewards/margins": 0.5344241261482239, "rewards/rejected": -0.7148125767707825, "step": 217 }, { "epoch": 0.3295540438397581, "grad_norm": 13.993760108947754, "kl/avg_steps": 0.21875, "kl/beta": 0.05626062676310539, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.253687219265803e-07, "logits/chosen": -1.5941836833953857, "logits/rejected": -1.464249610900879, "logps/chosen": -95.8490219116211, "logps/ref_chosen": -92.47299194335938, "logps/ref_rejected": -92.80751037597656, "logps/rejected": -102.18400573730469, "loss": 1.2869, "rewards/accuracies": 0.640625, "rewards/chosen": -0.19337476789951324, "rewards/margins": 0.32971978187561035, "rewards/rejected": -0.5230945944786072, "step": 218 }, { "epoch": 0.3310657596371882, "grad_norm": 12.02961254119873, "kl/avg_steps": 0.484375, "kl/beta": 0.056137826293706894, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.2442393573227043e-07, "logits/chosen": -1.676293134689331, "logits/rejected": -1.6631247997283936, "logps/chosen": -79.24311828613281, "logps/ref_chosen": -77.10382080078125, "logps/ref_rejected": -92.34390258789062, "logps/rejected": -103.1561279296875, "loss": 1.0932, "rewards/accuracies": 0.75, "rewards/chosen": -0.12065555900335312, "rewards/margins": 0.48062241077423096, "rewards/rejected": -0.6012779474258423, "step": 219 }, { "epoch": 0.3325774754346183, "grad_norm": 12.431059837341309, "kl/avg_steps": 0.375, "kl/beta": 0.05586721748113632, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.234742705255272e-07, "logits/chosen": -1.2905054092407227, "logits/rejected": -1.2640047073364258, "logps/chosen": -63.75226974487305, "logps/ref_chosen": -62.48020935058594, "logps/ref_rejected": -86.93277740478516, "logps/rejected": -95.7583236694336, "loss": 1.1614, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07252918183803558, "rewards/margins": 0.415139377117157, "rewards/rejected": -0.48766857385635376, "step": 220 }, { "epoch": 0.3340891912320484, "grad_norm": 11.70767879486084, "kl/avg_steps": 0.421875, "kl/beta": 0.055658500641584396, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.22519752870528e-07, "logits/chosen": -1.5557457208633423, "logits/rejected": -1.4986720085144043, "logps/chosen": -80.43194580078125, "logps/ref_chosen": -78.35491943359375, "logps/ref_rejected": -108.17631530761719, "logps/rejected": -118.90814971923828, "loss": 1.1518, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11834853887557983, "rewards/margins": 0.47405463457107544, "rewards/rejected": -0.5924031734466553, "step": 221 }, { "epoch": 0.3356009070294785, "grad_norm": 14.381481170654297, "kl/avg_steps": 0.4375, "kl/beta": 0.05542467534542084, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -1.5387976169586182, "logits/rejected": -1.5302550792694092, "logps/chosen": -80.1806640625, "logps/ref_chosen": -77.2734375, "logps/ref_rejected": -126.41007995605469, "logps/rejected": -140.3523406982422, "loss": 1.0157, "rewards/accuracies": 0.703125, "rewards/chosen": -0.16337642073631287, "rewards/margins": 0.6037923097610474, "rewards/rejected": -0.7671687602996826, "step": 222 }, { "epoch": 0.3371126228269085, "grad_norm": 10.387847900390625, "kl/avg_steps": 0.5625, "kl/beta": 0.055183250457048416, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.2059626715039065e-07, "logits/chosen": -1.778557538986206, "logits/rejected": -1.7442163228988647, "logps/chosen": -81.94721984863281, "logps/ref_chosen": -78.4210205078125, "logps/ref_rejected": -101.38420867919922, "logps/rejected": -116.384033203125, "loss": 1.0063, "rewards/accuracies": 0.828125, "rewards/chosen": -0.19502390921115875, "rewards/margins": 0.6252959966659546, "rewards/rejected": -0.8203198909759521, "step": 223 }, { "epoch": 0.3386243386243386, "grad_norm": 15.818403244018555, "kl/avg_steps": 0.53125, "kl/beta": 0.05487458035349846, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.1962735288928304e-07, "logits/chosen": -1.4391138553619385, "logits/rejected": -1.5035314559936523, "logps/chosen": -84.6730728149414, "logps/ref_chosen": -79.36337280273438, "logps/ref_rejected": -89.99789428710938, "logps/rejected": -102.62284088134766, "loss": 1.1549, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2928454875946045, "rewards/margins": 0.39513200521469116, "rewards/rejected": -0.6879774928092957, "step": 224 }, { "epoch": 0.3401360544217687, "grad_norm": 14.311567306518555, "kl/avg_steps": 0.375, "kl/beta": 0.054584600031375885, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.186536937864752e-07, "logits/chosen": -1.5679316520690918, "logits/rejected": -1.7045098543167114, "logps/chosen": -94.7939453125, "logps/ref_chosen": -88.9960708618164, "logps/ref_rejected": -127.55032348632812, "logps/rejected": -143.58074951171875, "loss": 1.1249, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3172207772731781, "rewards/margins": 0.5503044128417969, "rewards/rejected": -0.8675251603126526, "step": 225 }, { "epoch": 0.3416477702191988, "grad_norm": 10.593521118164062, "kl/avg_steps": 0.375, "kl/beta": 0.05438067018985748, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.176753170773052e-07, "logits/chosen": -1.5852404832839966, "logits/rejected": -1.4178290367126465, "logps/chosen": -71.80659484863281, "logps/ref_chosen": -68.68444061279297, "logps/ref_rejected": -85.81898498535156, "logps/rejected": -99.06565856933594, "loss": 1.1023, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17008624970912933, "rewards/margins": 0.5425100922584534, "rewards/rejected": -0.7125963568687439, "step": 226 }, { "epoch": 0.3431594860166289, "grad_norm": 12.188491821289062, "kl/avg_steps": 0.34375, "kl/beta": 0.054177507758140564, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.166922501290729e-07, "logits/chosen": -1.364269733428955, "logits/rejected": -1.418736457824707, "logps/chosen": -77.14056396484375, "logps/ref_chosen": -72.52030181884766, "logps/ref_rejected": -90.7720718383789, "logps/rejected": -105.46800231933594, "loss": 1.1484, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2539520263671875, "rewards/margins": 0.5371447205543518, "rewards/rejected": -0.7910966873168945, "step": 227 }, { "epoch": 0.34467120181405897, "grad_norm": 12.944748878479004, "kl/avg_steps": 0.3125, "kl/beta": 0.05399191007018089, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.1570452044027405e-07, "logits/chosen": -1.4697837829589844, "logits/rejected": -1.390408992767334, "logps/chosen": -77.01600646972656, "logps/ref_chosen": -72.23167419433594, "logps/ref_rejected": -95.45873260498047, "logps/rejected": -109.11170196533203, "loss": 1.1582, "rewards/accuracies": 0.671875, "rewards/chosen": -0.26035553216934204, "rewards/margins": 0.4708428382873535, "rewards/rejected": -0.7311983108520508, "step": 228 }, { "epoch": 0.34618291761148906, "grad_norm": 11.364020347595215, "kl/avg_steps": 0.34375, "kl/beta": 0.05382370948791504, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.147121556398312e-07, "logits/chosen": -1.5286439657211304, "logits/rejected": -1.5656533241271973, "logps/chosen": -66.92510986328125, "logps/ref_chosen": -66.88822174072266, "logps/ref_rejected": -92.27890014648438, "logps/rejected": -102.7301025390625, "loss": 1.0666, "rewards/accuracies": 0.734375, "rewards/chosen": -0.002183683216571808, "rewards/margins": 0.5534392595291138, "rewards/rejected": -0.5556229948997498, "step": 229 }, { "epoch": 0.3476946334089191, "grad_norm": 14.49431324005127, "kl/avg_steps": 0.375, "kl/beta": 0.05363932624459267, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.7045170068740845, "logits/rejected": -1.6729419231414795, "logps/chosen": -79.54010009765625, "logps/ref_chosen": -76.12332153320312, "logps/ref_rejected": -78.19171905517578, "logps/rejected": -89.6407241821289, "loss": 1.1833, "rewards/accuracies": 0.703125, "rewards/chosen": -0.1854729950428009, "rewards/margins": 0.4231482148170471, "rewards/rejected": -0.6086212396621704, "step": 230 }, { "epoch": 0.3492063492063492, "grad_norm": 13.21183967590332, "kl/avg_steps": 0.46875, "kl/beta": 0.053438927978277206, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.1271363186719835e-07, "logits/chosen": -1.4684646129608154, "logits/rejected": -1.1993520259857178, "logps/chosen": -96.73280334472656, "logps/ref_chosen": -92.45181274414062, "logps/ref_rejected": -100.89735412597656, "logps/rejected": -117.29405212402344, "loss": 1.0262, "rewards/accuracies": 0.765625, "rewards/chosen": -0.23050335049629211, "rewards/margins": 0.6388251781463623, "rewards/rejected": -0.8693285584449768, "step": 231 }, { "epoch": 0.3507180650037793, "grad_norm": 14.258382797241211, "kl/avg_steps": 0.28125, "kl/beta": 0.05318960174918175, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.1170752879801436e-07, "logits/chosen": -1.6453282833099365, "logits/rejected": -1.5070879459381104, "logps/chosen": -89.7475357055664, "logps/ref_chosen": -86.75383758544922, "logps/ref_rejected": -98.16909790039062, "logps/rejected": -109.11927795410156, "loss": 1.2166, "rewards/accuracies": 0.625, "rewards/chosen": -0.16051898896694183, "rewards/margins": 0.41557577252388, "rewards/rejected": -0.5760947465896606, "step": 232 }, { "epoch": 0.35222978080120937, "grad_norm": 11.350086212158203, "kl/avg_steps": 0.0625, "kl/beta": 0.05304042622447014, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 4.106969024216348e-07, "logits/chosen": -1.3634660243988037, "logits/rejected": -1.2045012712478638, "logps/chosen": -74.8712158203125, "logps/ref_chosen": -72.87556457519531, "logps/ref_rejected": -85.22943115234375, "logps/rejected": -95.50105285644531, "loss": 1.1816, "rewards/accuracies": 0.609375, "rewards/chosen": -0.10806849598884583, "rewards/margins": 0.43179118633270264, "rewards/rejected": -0.5398597121238708, "step": 233 }, { "epoch": 0.35374149659863946, "grad_norm": 11.72382926940918, "kl/avg_steps": 0.1875, "kl/beta": 0.05300729721784592, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.09681781007452e-07, "logits/chosen": -1.547611951828003, "logits/rejected": -1.6473437547683716, "logps/chosen": -71.36154174804688, "logps/ref_chosen": -70.05477905273438, "logps/ref_rejected": -68.7240982055664, "logps/rejected": -78.51712799072266, "loss": 1.1585, "rewards/accuracies": 0.640625, "rewards/chosen": -0.07237260043621063, "rewards/margins": 0.4424641728401184, "rewards/rejected": -0.5148367881774902, "step": 234 }, { "epoch": 0.35525321239606955, "grad_norm": 14.6141996383667, "kl/avg_steps": 0.421875, "kl/beta": 0.052908092737197876, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.08662192950594e-07, "logits/chosen": -1.7234680652618408, "logits/rejected": -1.5612242221832275, "logps/chosen": -85.46180725097656, "logps/ref_chosen": -85.86051940917969, "logps/ref_rejected": -96.14663696289062, "logps/rejected": -106.57968139648438, "loss": 1.0272, "rewards/accuracies": 0.8125, "rewards/chosen": 0.020824704319238663, "rewards/margins": 0.5663049221038818, "rewards/rejected": -0.5454801917076111, "step": 235 }, { "epoch": 0.35676492819349964, "grad_norm": 11.916207313537598, "kl/avg_steps": 0.25, "kl/beta": 0.052685827016830444, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.076381667711306e-07, "logits/chosen": -1.7236182689666748, "logits/rejected": -1.3514024019241333, "logps/chosen": -92.91275787353516, "logps/ref_chosen": -89.75252532958984, "logps/ref_rejected": -99.28534698486328, "logps/rejected": -112.02670288085938, "loss": 1.1745, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17047369480133057, "rewards/margins": 0.4959697127342224, "rewards/rejected": -0.6664433479309082, "step": 236 }, { "epoch": 0.35827664399092973, "grad_norm": 13.740058898925781, "kl/avg_steps": 0.15625, "kl/beta": 0.05255443975329399, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 4.066097311132753e-07, "logits/chosen": -1.4527928829193115, "logits/rejected": -1.2798683643341064, "logps/chosen": -95.31475067138672, "logps/ref_chosen": -92.59001922607422, "logps/ref_rejected": -101.45585632324219, "logps/rejected": -110.77891540527344, "loss": 1.2174, "rewards/accuracies": 0.65625, "rewards/chosen": -0.14533735811710358, "rewards/margins": 0.3403947353363037, "rewards/rejected": -0.4857320785522461, "step": 237 }, { "epoch": 0.35978835978835977, "grad_norm": 10.330193519592285, "kl/avg_steps": 0.375, "kl/beta": 0.05247244983911514, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.0557691474458414e-07, "logits/chosen": -1.3112481832504272, "logits/rejected": -1.2185739278793335, "logps/chosen": -83.69696807861328, "logps/ref_chosen": -82.2470474243164, "logps/ref_rejected": -92.59944152832031, "logps/rejected": -104.04986572265625, "loss": 1.0835, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07855997234582901, "rewards/margins": 0.5173219442367554, "rewards/rejected": -0.5958819389343262, "step": 238 }, { "epoch": 0.36130007558578986, "grad_norm": 12.408441543579102, "kl/avg_steps": 0.46875, "kl/beta": 0.05227641388773918, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.045397465551513e-07, "logits/chosen": -1.2152026891708374, "logits/rejected": -1.4065308570861816, "logps/chosen": -79.02906799316406, "logps/ref_chosen": -75.30878448486328, "logps/ref_rejected": -131.2318115234375, "logps/rejected": -145.25335693359375, "loss": 1.1178, "rewards/accuracies": 0.75, "rewards/chosen": -0.19433492422103882, "rewards/margins": 0.5304996967315674, "rewards/rejected": -0.7248346209526062, "step": 239 }, { "epoch": 0.36281179138321995, "grad_norm": 15.577049255371094, "kl/avg_steps": 0.53125, "kl/beta": 0.05203251168131828, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.0349825555680045e-07, "logits/chosen": -1.8081055879592896, "logits/rejected": -1.8345035314559937, "logps/chosen": -72.12168884277344, "logps/ref_chosen": -70.81785583496094, "logps/ref_rejected": -98.53778839111328, "logps/rejected": -114.63734436035156, "loss": 0.95, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06976085901260376, "rewards/margins": 0.7603122591972351, "rewards/rejected": -0.8300731778144836, "step": 240 }, { "epoch": 0.36432350718065004, "grad_norm": 13.830245971679688, "kl/avg_steps": 0.234375, "kl/beta": 0.05175755172967911, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.0245247088227377e-07, "logits/chosen": -1.710863709449768, "logits/rejected": -1.5587537288665771, "logps/chosen": -91.37069702148438, "logps/ref_chosen": -88.60260009765625, "logps/ref_rejected": -101.42214965820312, "logps/rejected": -111.60231018066406, "loss": 1.1998, "rewards/accuracies": 0.640625, "rewards/chosen": -0.14622732996940613, "rewards/margins": 0.37693488597869873, "rewards/rejected": -0.5231622457504272, "step": 241 }, { "epoch": 0.36583522297808013, "grad_norm": 12.737060546875, "kl/avg_steps": 0.640625, "kl/beta": 0.051636528223752975, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.0140242178441665e-07, "logits/chosen": -1.5724875926971436, "logits/rejected": -1.6078267097473145, "logps/chosen": -78.00882720947266, "logps/ref_chosen": -77.34109497070312, "logps/ref_rejected": -84.76332092285156, "logps/rejected": -97.80973815917969, "loss": 1.0196, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03599818795919418, "rewards/margins": 0.6313221454620361, "rewards/rejected": -0.6673203110694885, "step": 242 }, { "epoch": 0.3673469387755102, "grad_norm": 13.271928787231445, "kl/avg_steps": 0.3125, "kl/beta": 0.051307834684848785, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.003481376353596e-07, "logits/chosen": -1.4511834383010864, "logits/rejected": -1.342944622039795, "logps/chosen": -98.63224792480469, "logps/ref_chosen": -93.55897521972656, "logps/ref_rejected": -89.33551025390625, "logps/rejected": -103.51713562011719, "loss": 1.155, "rewards/accuracies": 0.671875, "rewards/chosen": -0.26296675205230713, "rewards/margins": 0.4598330557346344, "rewards/rejected": -0.7227997779846191, "step": 243 }, { "epoch": 0.3688586545729403, "grad_norm": 10.046283721923828, "kl/avg_steps": 0.59375, "kl/beta": 0.05114799737930298, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.9928964792569654e-07, "logits/chosen": -1.482820749282837, "logits/rejected": -1.142737865447998, "logps/chosen": -71.78289031982422, "logps/ref_chosen": -69.82603454589844, "logps/ref_rejected": -92.47640991210938, "logps/rejected": -110.2171630859375, "loss": 0.8783, "rewards/accuracies": 0.84375, "rewards/chosen": -0.10063998401165009, "rewards/margins": 0.7985442876815796, "rewards/rejected": -0.8991843461990356, "step": 244 }, { "epoch": 0.37037037037037035, "grad_norm": 11.417145729064941, "kl/avg_steps": 0.5625, "kl/beta": 0.050846099853515625, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.982269822636601e-07, "logits/chosen": -1.7022724151611328, "logits/rejected": -1.54628586769104, "logps/chosen": -91.77505493164062, "logps/ref_chosen": -85.68216705322266, "logps/ref_rejected": -93.8754653930664, "logps/rejected": -114.80189514160156, "loss": 0.9433, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3088444173336029, "rewards/margins": 0.7455431222915649, "rewards/rejected": -1.0543875694274902, "step": 245 }, { "epoch": 0.37188208616780044, "grad_norm": 12.649994850158691, "kl/avg_steps": 0.46875, "kl/beta": 0.05056168884038925, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.971601703742932e-07, "logits/chosen": -1.5699760913848877, "logits/rejected": -1.451397180557251, "logps/chosen": -100.3763656616211, "logps/ref_chosen": -90.05093383789062, "logps/ref_rejected": -112.77645874023438, "logps/rejected": -136.43222045898438, "loss": 1.0232, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5240607857704163, "rewards/margins": 0.6649122834205627, "rewards/rejected": -1.1889731884002686, "step": 246 }, { "epoch": 0.37339380196523053, "grad_norm": 19.564983367919922, "kl/avg_steps": 0.25, "kl/beta": 0.05032578855752945, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.960892420986177e-07, "logits/chosen": -1.52877676486969, "logits/rejected": -1.4984183311462402, "logps/chosen": -115.11454772949219, "logps/ref_chosen": -103.23979187011719, "logps/ref_rejected": -105.26278686523438, "logps/rejected": -123.63394165039062, "loss": 1.2711, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5987535715103149, "rewards/margins": 0.3198391795158386, "rewards/rejected": -0.9185927510261536, "step": 247 }, { "epoch": 0.3749055177626606, "grad_norm": 15.282120704650879, "kl/avg_steps": 0.46875, "kl/beta": 0.050200287252664566, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -1.6684272289276123, "logits/rejected": -1.437838077545166, "logps/chosen": -98.20668029785156, "logps/ref_chosen": -88.16007995605469, "logps/ref_rejected": -75.11514282226562, "logps/rejected": -96.37853240966797, "loss": 1.1208, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5064951777458191, "rewards/margins": 0.5542141199111938, "rewards/rejected": -1.0607093572616577, "step": 248 }, { "epoch": 0.3764172335600907, "grad_norm": 17.950796127319336, "kl/avg_steps": 0.25, "kl/beta": 0.04996607080101967, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.9393515632731094e-07, "logits/chosen": -1.2981152534484863, "logits/rejected": -1.335099458694458, "logps/chosen": -105.10684204101562, "logps/ref_chosen": -91.01773071289062, "logps/ref_rejected": -80.51113891601562, "logps/rejected": -98.92596435546875, "loss": 1.4253, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7059436440467834, "rewards/margins": 0.20767927169799805, "rewards/rejected": -0.9136229157447815, "step": 249 }, { "epoch": 0.3779289493575208, "grad_norm": 17.256160736083984, "kl/avg_steps": 0.53125, "kl/beta": 0.049841467291116714, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.9285205908608934e-07, "logits/chosen": -1.6364936828613281, "logits/rejected": -1.335681438446045, "logps/chosen": -93.50019836425781, "logps/ref_chosen": -80.5888671875, "logps/ref_rejected": -90.15093994140625, "logps/rejected": -117.05657958984375, "loss": 1.0234, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6429185271263123, "rewards/margins": 0.6881313323974609, "rewards/rejected": -1.331049919128418, "step": 250 }, { "epoch": 0.3794406651549509, "grad_norm": 14.625908851623535, "kl/avg_steps": 0.375, "kl/beta": 0.04957808554172516, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.9176496596569265e-07, "logits/chosen": -1.4772026538848877, "logits/rejected": -1.676422357559204, "logps/chosen": -95.66470336914062, "logps/ref_chosen": -82.70405578613281, "logps/ref_rejected": -98.94266510009766, "logps/rejected": -120.11300659179688, "loss": 1.2047, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6434235572814941, "rewards/margins": 0.39970919489860535, "rewards/rejected": -1.0431327819824219, "step": 251 }, { "epoch": 0.38095238095238093, "grad_norm": 11.416725158691406, "kl/avg_steps": 0.4375, "kl/beta": 0.049392860382795334, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.9067390737445254e-07, "logits/chosen": -1.717487096786499, "logits/rejected": -1.6335628032684326, "logps/chosen": -83.50773620605469, "logps/ref_chosen": -73.10369110107422, "logps/ref_rejected": -94.90235900878906, "logps/rejected": -113.88876342773438, "loss": 1.2021, "rewards/accuracies": 0.75, "rewards/chosen": -0.514301598072052, "rewards/margins": 0.41625702381134033, "rewards/rejected": -0.9305586814880371, "step": 252 }, { "epoch": 0.382464096749811, "grad_norm": 17.22648811340332, "kl/avg_steps": 0.375, "kl/beta": 0.049177709966897964, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.8957891383162304e-07, "logits/chosen": -1.2668952941894531, "logits/rejected": -1.309208869934082, "logps/chosen": -80.58470153808594, "logps/ref_chosen": -68.7789535522461, "logps/ref_rejected": -75.98162078857422, "logps/rejected": -95.89788818359375, "loss": 1.1948, "rewards/accuracies": 0.671875, "rewards/chosen": -0.580489993095398, "rewards/margins": 0.3920561969280243, "rewards/rejected": -0.9725462198257446, "step": 253 }, { "epoch": 0.3839758125472411, "grad_norm": 15.513288497924805, "kl/avg_steps": 0.3125, "kl/beta": 0.04899398237466812, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.884800159665276e-07, "logits/chosen": -1.7666159868240356, "logits/rejected": -1.7136458158493042, "logps/chosen": -94.73291778564453, "logps/ref_chosen": -81.49362182617188, "logps/ref_rejected": -101.43673706054688, "logps/rejected": -125.28010559082031, "loss": 1.1055, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6512777805328369, "rewards/margins": 0.5118330717086792, "rewards/rejected": -1.1631108522415161, "step": 254 }, { "epoch": 0.3854875283446712, "grad_norm": 18.647626876831055, "kl/avg_steps": 0.5625, "kl/beta": 0.04884135350584984, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.873772445177015e-07, "logits/chosen": -1.631853699684143, "logits/rejected": -1.4802442789077759, "logps/chosen": -102.59626770019531, "logps/ref_chosen": -90.46350860595703, "logps/ref_rejected": -105.32445526123047, "logps/rejected": -128.67465209960938, "loss": 1.1135, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5910813808441162, "rewards/margins": 0.5390965938568115, "rewards/rejected": -1.1301779747009277, "step": 255 }, { "epoch": 0.3869992441421013, "grad_norm": 11.703644752502441, "kl/avg_steps": 0.46875, "kl/beta": 0.048568155616521835, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.862706303320329e-07, "logits/chosen": -1.4065661430358887, "logits/rejected": -1.6162680387496948, "logps/chosen": -95.55255126953125, "logps/ref_chosen": -81.56578826904297, "logps/ref_rejected": -108.58460998535156, "logps/rejected": -134.34637451171875, "loss": 1.1095, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6802552938461304, "rewards/margins": 0.5633381605148315, "rewards/rejected": -1.243593454360962, "step": 256 }, { "epoch": 0.3885109599395314, "grad_norm": 16.859773635864258, "kl/avg_steps": 0.46875, "kl/beta": 0.048341553658246994, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.851602043638994e-07, "logits/chosen": -1.5432624816894531, "logits/rejected": -1.2107794284820557, "logps/chosen": -105.39306640625, "logps/ref_chosen": -89.57557678222656, "logps/ref_rejected": -123.74462127685547, "logps/rejected": -150.5018310546875, "loss": 1.1676, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7634068727493286, "rewards/margins": 0.52019864320755, "rewards/rejected": -1.2836055755615234, "step": 257 }, { "epoch": 0.3900226757369615, "grad_norm": 15.744192123413086, "kl/avg_steps": 0.625, "kl/beta": 0.048116009682416916, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.840459976743023e-07, "logits/chosen": -1.4739046096801758, "logits/rejected": -1.587180256843567, "logps/chosen": -92.76885986328125, "logps/ref_chosen": -77.34173583984375, "logps/ref_rejected": -99.5709228515625, "logps/rejected": -127.6431655883789, "loss": 0.9873, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7396783828735352, "rewards/margins": 0.601047158241272, "rewards/rejected": -1.3407254219055176, "step": 258 }, { "epoch": 0.3915343915343915, "grad_norm": 12.638566970825195, "kl/avg_steps": 0.5625, "kl/beta": 0.047817155718803406, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.8292804142999796e-07, "logits/chosen": -1.365210771560669, "logits/rejected": -1.1926498413085938, "logps/chosen": -91.97223663330078, "logps/ref_chosen": -82.39556121826172, "logps/ref_rejected": -113.73309326171875, "logps/rejected": -140.43887329101562, "loss": 0.9534, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4582955837249756, "rewards/margins": 0.808417797088623, "rewards/rejected": -1.2667133808135986, "step": 259 }, { "epoch": 0.3930461073318216, "grad_norm": 22.389368057250977, "kl/avg_steps": 0.4375, "kl/beta": 0.04754968732595444, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.818063669026256e-07, "logits/chosen": -1.1842126846313477, "logits/rejected": -1.1899852752685547, "logps/chosen": -79.66688537597656, "logps/ref_chosen": -65.98947143554688, "logps/ref_rejected": -94.59706115722656, "logps/rejected": -119.67667388916016, "loss": 1.1858, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6499999761581421, "rewards/margins": 0.5323763489723206, "rewards/rejected": -1.1823763847351074, "step": 260 }, { "epoch": 0.3945578231292517, "grad_norm": 14.604168891906738, "kl/avg_steps": 0.34375, "kl/beta": 0.04734256491065025, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.806810054678331e-07, "logits/chosen": -1.3006818294525146, "logits/rejected": -1.2254362106323242, "logps/chosen": -101.1712646484375, "logps/ref_chosen": -88.87684631347656, "logps/ref_rejected": -82.348388671875, "logps/rejected": -103.30619049072266, "loss": 1.2224, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5821806192398071, "rewards/margins": 0.40273311734199524, "rewards/rejected": -0.98491370677948, "step": 261 }, { "epoch": 0.3960695389266818, "grad_norm": 11.085594177246094, "kl/avg_steps": 0.4375, "kl/beta": 0.047180380672216415, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.7955198860439887e-07, "logits/chosen": -1.4064021110534668, "logits/rejected": -1.547346830368042, "logps/chosen": -94.58271789550781, "logps/ref_chosen": -85.81719970703125, "logps/ref_rejected": -105.49027252197266, "logps/rejected": -125.6654281616211, "loss": 1.0612, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4132178723812103, "rewards/margins": 0.5312476754188538, "rewards/rejected": -0.9444655179977417, "step": 262 }, { "epoch": 0.3975812547241119, "grad_norm": 11.340027809143066, "kl/avg_steps": 0.390625, "kl/beta": 0.04697486758232117, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.784193478933516e-07, "logits/chosen": -1.2239423990249634, "logits/rejected": -1.7259702682495117, "logps/chosen": -82.4855728149414, "logps/ref_chosen": -73.61693572998047, "logps/ref_rejected": -102.39161682128906, "logps/rejected": -123.17403411865234, "loss": 1.0883, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4169412851333618, "rewards/margins": 0.5516859889030457, "rewards/rejected": -0.9686272740364075, "step": 263 }, { "epoch": 0.39909297052154197, "grad_norm": 10.866528511047363, "kl/avg_steps": 0.5, "kl/beta": 0.04679208621382713, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.7728311501708674e-07, "logits/chosen": -1.3833404779434204, "logits/rejected": -1.5971425771713257, "logps/chosen": -112.40450286865234, "logps/ref_chosen": -101.57856750488281, "logps/ref_rejected": -111.6573486328125, "logps/rejected": -135.07891845703125, "loss": 1.0381, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5059612989425659, "rewards/margins": 0.5817750096321106, "rewards/rejected": -1.0877363681793213, "step": 264 }, { "epoch": 0.40060468631897206, "grad_norm": 12.059004783630371, "kl/avg_steps": 0.5625, "kl/beta": 0.04655928909778595, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.7614332175848027e-07, "logits/chosen": -1.4162359237670898, "logits/rejected": -1.36917245388031, "logps/chosen": -72.67507934570312, "logps/ref_chosen": -65.76426696777344, "logps/ref_rejected": -85.19627380371094, "logps/rejected": -107.6723861694336, "loss": 1.0147, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3222898840904236, "rewards/margins": 0.7146923542022705, "rewards/rejected": -1.0369821786880493, "step": 265 }, { "epoch": 0.4021164021164021, "grad_norm": 11.972450256347656, "kl/avg_steps": 0.5625, "kl/beta": 0.046298857778310776, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.75e-07, "logits/chosen": -1.4073446989059448, "logits/rejected": -1.5456604957580566, "logps/chosen": -80.41755676269531, "logps/ref_chosen": -75.05682373046875, "logps/ref_rejected": -97.52758026123047, "logps/rejected": -116.14582824707031, "loss": 1.0522, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2491399049758911, "rewards/margins": 0.6057909727096558, "rewards/rejected": -0.8549308776855469, "step": 266 }, { "epoch": 0.4036281179138322, "grad_norm": 11.077033042907715, "kl/avg_steps": 0.4375, "kl/beta": 0.0460398830473423, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.738531817228131e-07, "logits/chosen": -1.2547085285186768, "logits/rejected": -1.262139916419983, "logps/chosen": -75.00776672363281, "logps/ref_chosen": -71.13494110107422, "logps/ref_rejected": -81.14566040039062, "logps/rejected": -96.05207824707031, "loss": 1.1117, "rewards/accuracies": 0.75, "rewards/chosen": -0.18061110377311707, "rewards/margins": 0.5004351735115051, "rewards/rejected": -0.6810463666915894, "step": 267 }, { "epoch": 0.4051398337112623, "grad_norm": 10.130515098571777, "kl/avg_steps": 0.1875, "kl/beta": 0.0458393357694149, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 3.7270289900589204e-07, "logits/chosen": -1.469900369644165, "logits/rejected": -1.4314866065979004, "logps/chosen": -85.93333435058594, "logps/ref_chosen": -80.06082153320312, "logps/ref_rejected": -87.43035888671875, "logps/rejected": -101.26762390136719, "loss": 1.2189, "rewards/accuracies": 0.578125, "rewards/chosen": -0.2703646123409271, "rewards/margins": 0.359006404876709, "rewards/rejected": -0.6293710470199585, "step": 268 }, { "epoch": 0.40665154950869237, "grad_norm": 10.948187828063965, "kl/avg_steps": 0.453125, "kl/beta": 0.04575354605913162, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.7154918402511714e-07, "logits/chosen": -1.5809710025787354, "logits/rejected": -1.2682452201843262, "logps/chosen": -90.92002868652344, "logps/ref_chosen": -83.36943817138672, "logps/ref_rejected": -100.66839599609375, "logps/rejected": -119.58181762695312, "loss": 1.0943, "rewards/accuracies": 0.71875, "rewards/chosen": -0.34568658471107483, "rewards/margins": 0.5124187469482422, "rewards/rejected": -0.8581053018569946, "step": 269 }, { "epoch": 0.40816326530612246, "grad_norm": 11.601164817810059, "kl/avg_steps": 0.46875, "kl/beta": 0.045547161251306534, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.7039206905237656e-07, "logits/chosen": -1.4058010578155518, "logits/rejected": -1.5133944749832153, "logps/chosen": -91.64334106445312, "logps/ref_chosen": -85.35945129394531, "logps/ref_rejected": -104.47489929199219, "logps/rejected": -121.33040618896484, "loss": 1.1188, "rewards/accuracies": 0.75, "rewards/chosen": -0.2873057723045349, "rewards/margins": 0.4743884801864624, "rewards/rejected": -0.7616941928863525, "step": 270 }, { "epoch": 0.40967498110355255, "grad_norm": 12.935283660888672, "kl/avg_steps": 0.1875, "kl/beta": 0.04533465579152107, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 3.692315864546635e-07, "logits/chosen": -1.7285494804382324, "logits/rejected": -1.6788990497589111, "logps/chosen": -93.83595275878906, "logps/ref_chosen": -86.01373291015625, "logps/ref_rejected": -109.99561309814453, "logps/rejected": -125.69892120361328, "loss": 1.2685, "rewards/accuracies": 0.546875, "rewards/chosen": -0.35634368658065796, "rewards/margins": 0.3500638008117676, "rewards/rejected": -0.7064074873924255, "step": 271 }, { "epoch": 0.41118669690098264, "grad_norm": 14.665283203125, "kl/avg_steps": 0.53125, "kl/beta": 0.0452498123049736, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.6806776869317067e-07, "logits/chosen": -1.4215366840362549, "logits/rejected": -1.2945075035095215, "logps/chosen": -89.55738830566406, "logps/ref_chosen": -86.3701400756836, "logps/ref_rejected": -85.74638366699219, "logps/rejected": -104.7607421875, "loss": 0.9318, "rewards/accuracies": 0.796875, "rewards/chosen": -0.145250603556633, "rewards/margins": 0.7082393169403076, "rewards/rejected": -0.853489875793457, "step": 272 }, { "epoch": 0.4126984126984127, "grad_norm": 17.91227912902832, "kl/avg_steps": 0.5625, "kl/beta": 0.04501069337129593, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.669006483223828e-07, "logits/chosen": -1.6112267971038818, "logits/rejected": -1.544572114944458, "logps/chosen": -82.86746215820312, "logps/ref_chosen": -75.51087951660156, "logps/ref_rejected": -101.60345458984375, "logps/rejected": -121.30127716064453, "loss": 1.1397, "rewards/accuracies": 0.765625, "rewards/chosen": -0.33276399970054626, "rewards/margins": 0.5460294485092163, "rewards/rejected": -0.8787934184074402, "step": 273 }, { "epoch": 0.41421012849584277, "grad_norm": 10.048867225646973, "kl/avg_steps": 0.4375, "kl/beta": 0.04475892335176468, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.657302579891656e-07, "logits/chosen": -1.2657063007354736, "logits/rejected": -1.2205548286437988, "logps/chosen": -86.40071105957031, "logps/ref_chosen": -79.040283203125, "logps/ref_rejected": -86.31329345703125, "logps/rejected": -106.71549987792969, "loss": 1.0445, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3302973508834839, "rewards/margins": 0.5760163068771362, "rewards/rejected": -0.9063136577606201, "step": 274 }, { "epoch": 0.41572184429327286, "grad_norm": 10.825162887573242, "kl/avg_steps": 0.5625, "kl/beta": 0.044563956558704376, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.645566304318526e-07, "logits/chosen": -1.4031257629394531, "logits/rejected": -1.5135775804519653, "logps/chosen": -75.87445831298828, "logps/ref_chosen": -71.82034301757812, "logps/ref_rejected": -94.29946899414062, "logps/rejected": -114.9261474609375, "loss": 0.9428, "rewards/accuracies": 0.796875, "rewards/chosen": -0.1804373562335968, "rewards/margins": 0.7301989793777466, "rewards/rejected": -0.910636305809021, "step": 275 }, { "epoch": 0.41723356009070295, "grad_norm": 14.92078685760498, "kl/avg_steps": 0.46875, "kl/beta": 0.044314686208963394, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.633797984793294e-07, "logits/chosen": -1.5658140182495117, "logits/rejected": -1.4458580017089844, "logps/chosen": -74.48373413085938, "logps/ref_chosen": -69.54020690917969, "logps/ref_rejected": -78.59674072265625, "logps/rejected": -96.87590026855469, "loss": 1.0668, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22064831852912903, "rewards/margins": 0.582705020904541, "rewards/rejected": -0.8033533096313477, "step": 276 }, { "epoch": 0.41874527588813304, "grad_norm": 12.200444221496582, "kl/avg_steps": 0.171875, "kl/beta": 0.04410793259739876, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.6219979505011555e-07, "logits/chosen": -1.7910408973693848, "logits/rejected": -1.4916061162948608, "logps/chosen": -105.21025085449219, "logps/ref_chosen": -94.4896240234375, "logps/ref_rejected": -85.45901489257812, "logps/rejected": -103.62118530273438, "loss": 1.2871, "rewards/accuracies": 0.640625, "rewards/chosen": -0.47567400336265564, "rewards/margins": 0.3210156559944153, "rewards/rejected": -0.7966896891593933, "step": 277 }, { "epoch": 0.42025699168556313, "grad_norm": 13.179181098937988, "kl/avg_steps": 0.421875, "kl/beta": 0.044032249599695206, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.6101665315144353e-07, "logits/chosen": -1.5639655590057373, "logits/rejected": -1.7161953449249268, "logps/chosen": -97.54840850830078, "logps/ref_chosen": -87.42613220214844, "logps/ref_rejected": -105.44854736328125, "logps/rejected": -126.17444610595703, "loss": 1.183, "rewards/accuracies": 0.734375, "rewards/chosen": -0.44696807861328125, "rewards/margins": 0.45861679315567017, "rewards/rejected": -0.9055849313735962, "step": 278 }, { "epoch": 0.4217687074829932, "grad_norm": 12.131983757019043, "kl/avg_steps": 0.625, "kl/beta": 0.04384727030992508, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.5983040587833563e-07, "logits/chosen": -1.8436882495880127, "logits/rejected": -1.4893114566802979, "logps/chosen": -72.60688781738281, "logps/ref_chosen": -70.516845703125, "logps/ref_rejected": -86.04248809814453, "logps/rejected": -106.16204071044922, "loss": 0.9083, "rewards/accuracies": 0.828125, "rewards/chosen": -0.09204297512769699, "rewards/margins": 0.7816708087921143, "rewards/rejected": -0.8737137913703918, "step": 279 }, { "epoch": 0.42328042328042326, "grad_norm": 17.89347267150879, "kl/avg_steps": 0.578125, "kl/beta": 0.043574925512075424, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.586410864126781e-07, "logits/chosen": -1.7019422054290771, "logits/rejected": -1.617280125617981, "logps/chosen": -81.91899108886719, "logps/ref_chosen": -76.5021743774414, "logps/ref_rejected": -94.2752685546875, "logps/rejected": -116.81834411621094, "loss": 0.9263, "rewards/accuracies": 0.84375, "rewards/chosen": -0.23566317558288574, "rewards/margins": 0.7379894256591797, "rewards/rejected": -0.9736526608467102, "step": 280 }, { "epoch": 0.42479213907785335, "grad_norm": 10.356485366821289, "kl/avg_steps": 0.40625, "kl/beta": 0.04332445561885834, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.574487280222929e-07, "logits/chosen": -1.6503106355667114, "logits/rejected": -1.4820497035980225, "logps/chosen": -84.52981567382812, "logps/ref_chosen": -77.50468444824219, "logps/ref_rejected": -79.05716705322266, "logps/rejected": -99.70474243164062, "loss": 1.0542, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3050358295440674, "rewards/margins": 0.5821521282196045, "rewards/rejected": -0.8871879577636719, "step": 281 }, { "epoch": 0.42630385487528344, "grad_norm": 14.454909324645996, "kl/avg_steps": 0.46875, "kl/beta": 0.04314916208386421, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.562533640600075e-07, "logits/chosen": -1.5837814807891846, "logits/rejected": -1.5102015733718872, "logps/chosen": -89.67692565917969, "logps/ref_chosen": -80.31298065185547, "logps/ref_rejected": -83.72120666503906, "logps/rejected": -108.5927505493164, "loss": 1.032, "rewards/accuracies": 0.765625, "rewards/chosen": -0.40527933835983276, "rewards/margins": 0.6601608991622925, "rewards/rejected": -1.06544029712677, "step": 282 }, { "epoch": 0.42781557067271353, "grad_norm": 12.454751968383789, "kl/avg_steps": 0.5, "kl/beta": 0.04294784367084503, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.550550279627215e-07, "logits/chosen": -1.2369191646575928, "logits/rejected": -1.814032793045044, "logps/chosen": -93.92501831054688, "logps/ref_chosen": -80.72602844238281, "logps/ref_rejected": -115.68379211425781, "logps/rejected": -142.0513916015625, "loss": 1.1111, "rewards/accuracies": 0.75, "rewards/chosen": -0.5656991004943848, "rewards/margins": 0.5568342804908752, "rewards/rejected": -1.1225333213806152, "step": 283 }, { "epoch": 0.4293272864701436, "grad_norm": 10.446672439575195, "kl/avg_steps": 0.578125, "kl/beta": 0.04273417592048645, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -1.236511468887329, "logits/rejected": -1.3110226392745972, "logps/chosen": -86.48180389404297, "logps/ref_chosen": -77.5223388671875, "logps/ref_rejected": -104.1847152709961, "logps/rejected": -130.71986389160156, "loss": 0.9483, "rewards/accuracies": 0.796875, "rewards/chosen": -0.38278716802597046, "rewards/margins": 0.742293119430542, "rewards/rejected": -1.1250803470611572, "step": 284 }, { "epoch": 0.4308390022675737, "grad_norm": 14.101186752319336, "kl/avg_steps": 0.328125, "kl/beta": 0.04248853772878647, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.5264957352549375e-07, "logits/chosen": -1.3226224184036255, "logits/rejected": -1.2694287300109863, "logps/chosen": -105.07521057128906, "logps/ref_chosen": -85.79348754882812, "logps/ref_rejected": -96.46463775634766, "logps/rejected": -126.5572509765625, "loss": 1.2016, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8215754628181458, "rewards/margins": 0.45096495747566223, "rewards/rejected": -1.2725403308868408, "step": 285 }, { "epoch": 0.4323507180650038, "grad_norm": 10.810320854187012, "kl/avg_steps": 0.53125, "kl/beta": 0.04234957695007324, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.514425224712835e-07, "logits/chosen": -1.5429000854492188, "logits/rejected": -1.6007463932037354, "logps/chosen": -94.66502380371094, "logps/ref_chosen": -77.86268615722656, "logps/ref_rejected": -110.77134704589844, "logps/rejected": -144.87844848632812, "loss": 0.997, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7112575769424438, "rewards/margins": 0.723479151725769, "rewards/rejected": -1.434736728668213, "step": 286 }, { "epoch": 0.43386243386243384, "grad_norm": 11.010448455810547, "kl/avg_steps": 0.59375, "kl/beta": 0.04212578386068344, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.502326338516534e-07, "logits/chosen": -1.6829123497009277, "logits/rejected": -1.173173189163208, "logps/chosen": -73.6214599609375, "logps/ref_chosen": -62.552825927734375, "logps/ref_rejected": -77.7650146484375, "logps/rejected": -112.00250244140625, "loss": 0.8527, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4675137996673584, "rewards/margins": 0.9647963047027588, "rewards/rejected": -1.4323101043701172, "step": 287 }, { "epoch": 0.43537414965986393, "grad_norm": 15.223966598510742, "kl/avg_steps": 0.375, "kl/beta": 0.0418771393597126, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.490199415097892e-07, "logits/chosen": -1.740882396697998, "logits/rejected": -1.5782954692840576, "logps/chosen": -103.83071899414062, "logps/ref_chosen": -83.74117279052734, "logps/ref_rejected": -106.93913269042969, "logps/rejected": -139.09048461914062, "loss": 1.1656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.841812252998352, "rewards/margins": 0.49655526876449585, "rewards/rejected": -1.3383675813674927, "step": 288 }, { "epoch": 0.436885865457294, "grad_norm": 10.957082748413086, "kl/avg_steps": 0.3125, "kl/beta": 0.041720688343048096, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.4780447936730247e-07, "logits/chosen": -1.3379182815551758, "logits/rejected": -1.224417805671692, "logps/chosen": -92.84049987792969, "logps/ref_chosen": -73.04204559326172, "logps/ref_rejected": -88.07904052734375, "logps/rejected": -124.24183654785156, "loss": 1.0359, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8277724981307983, "rewards/margins": 0.6740319728851318, "rewards/rejected": -1.5018043518066406, "step": 289 }, { "epoch": 0.4383975812547241, "grad_norm": 11.407926559448242, "kl/avg_steps": 0.46875, "kl/beta": 0.04159071668982506, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.465862814232821e-07, "logits/chosen": -1.1827163696289062, "logits/rejected": -1.036186695098877, "logps/chosen": -104.59332275390625, "logps/ref_chosen": -78.60614013671875, "logps/ref_rejected": -108.50082397460938, "logps/rejected": -148.4783935546875, "loss": 1.1027, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0788097381591797, "rewards/margins": 0.5728899836540222, "rewards/rejected": -1.6516997814178467, "step": 290 }, { "epoch": 0.4399092970521542, "grad_norm": 13.589823722839355, "kl/avg_steps": 0.40625, "kl/beta": 0.041396670043468475, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.4536538175334343e-07, "logits/chosen": -1.402148723602295, "logits/rejected": -1.925370216369629, "logps/chosen": -89.59666442871094, "logps/ref_chosen": -66.71226501464844, "logps/ref_rejected": -96.14028930664062, "logps/rejected": -136.31503295898438, "loss": 1.0914, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9476636648178101, "rewards/margins": 0.7052263021469116, "rewards/rejected": -1.6528899669647217, "step": 291 }, { "epoch": 0.4414210128495843, "grad_norm": 12.945611000061035, "kl/avg_steps": 0.4375, "kl/beta": 0.04122917354106903, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.4414181450867465e-07, "logits/chosen": -1.4477874040603638, "logits/rejected": -1.6135427951812744, "logps/chosen": -103.1048583984375, "logps/ref_chosen": -80.3355484008789, "logps/ref_rejected": -90.44906616210938, "logps/rejected": -128.4977569580078, "loss": 1.0903, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9375072121620178, "rewards/margins": 0.6204345226287842, "rewards/rejected": -1.5579416751861572, "step": 292 }, { "epoch": 0.4429327286470144, "grad_norm": 12.051216125488281, "kl/avg_steps": 0.5, "kl/beta": 0.04104958474636078, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.4291561391508185e-07, "logits/chosen": -1.367284893989563, "logits/rejected": -1.1402729749679565, "logps/chosen": -95.94146728515625, "logps/ref_chosen": -71.69970703125, "logps/ref_rejected": -102.13948059082031, "logps/rejected": -145.989013671875, "loss": 1.0684, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9954971075057983, "rewards/margins": 0.7931400537490845, "rewards/rejected": -1.7886371612548828, "step": 293 }, { "epoch": 0.4444444444444444, "grad_norm": 17.355674743652344, "kl/avg_steps": 0.375, "kl/beta": 0.0408453568816185, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.4168681427203153e-07, "logits/chosen": -1.38045072555542, "logits/rejected": -1.2437189817428589, "logps/chosen": -95.20700073242188, "logps/ref_chosen": -70.73458862304688, "logps/ref_rejected": -86.68821716308594, "logps/rejected": -126.80274963378906, "loss": 1.0621, "rewards/accuracies": 0.671875, "rewards/chosen": -0.99778813123703, "rewards/margins": 0.6298106908798218, "rewards/rejected": -1.6275988817214966, "step": 294 }, { "epoch": 0.4459561602418745, "grad_norm": 15.311004638671875, "kl/avg_steps": 0.53125, "kl/beta": 0.040692757815122604, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.4045544995169125e-07, "logits/chosen": -1.6135108470916748, "logits/rejected": -1.862878680229187, "logps/chosen": -92.92829895019531, "logps/ref_chosen": -66.42643737792969, "logps/ref_rejected": -99.58766174316406, "logps/rejected": -138.8087158203125, "loss": 1.1684, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0746097564697266, "rewards/margins": 0.5088898539543152, "rewards/rejected": -1.5834996700286865, "step": 295 }, { "epoch": 0.4474678760393046, "grad_norm": 12.159005165100098, "kl/avg_steps": 0.46875, "kl/beta": 0.04047771915793419, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.392215553979679e-07, "logits/chosen": -1.4763997793197632, "logits/rejected": -1.5790629386901855, "logps/chosen": -112.81022644042969, "logps/ref_chosen": -87.47459411621094, "logps/ref_rejected": -103.96894836425781, "logps/rejected": -148.70249938964844, "loss": 1.0083, "rewards/accuracies": 0.6875, "rewards/chosen": -1.024478793144226, "rewards/margins": 0.7745535373687744, "rewards/rejected": -1.7990323305130005, "step": 296 }, { "epoch": 0.4489795918367347, "grad_norm": 17.412349700927734, "kl/avg_steps": 0.5625, "kl/beta": 0.04028886556625366, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.3798516512554485e-07, "logits/chosen": -1.5393011569976807, "logits/rejected": -1.604128122329712, "logps/chosen": -100.6285171508789, "logps/ref_chosen": -73.46731567382812, "logps/ref_rejected": -88.22674560546875, "logps/rejected": -134.59902954101562, "loss": 0.9386, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0900453329086304, "rewards/margins": 0.7648427486419678, "rewards/rejected": -1.8548879623413086, "step": 297 }, { "epoch": 0.4504913076341648, "grad_norm": 15.284070014953613, "kl/avg_steps": 0.5, "kl/beta": 0.04006350785493851, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.367463137189156e-07, "logits/chosen": -1.5410802364349365, "logits/rejected": -1.2973120212554932, "logps/chosen": -97.686279296875, "logps/ref_chosen": -73.21676635742188, "logps/ref_rejected": -84.9563217163086, "logps/rejected": -126.0870590209961, "loss": 1.1325, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9782853722572327, "rewards/margins": 0.6568226218223572, "rewards/rejected": -1.6351079940795898, "step": 298 }, { "epoch": 0.4520030234315949, "grad_norm": 12.5038480758667, "kl/avg_steps": 0.34375, "kl/beta": 0.0398641899228096, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.355050358314172e-07, "logits/chosen": -1.4272797107696533, "logits/rejected": -1.3961483240127563, "logps/chosen": -104.31736755371094, "logps/ref_chosen": -76.9534912109375, "logps/ref_rejected": -87.53433227539062, "logps/rejected": -127.41835021972656, "loss": 1.2044, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0907185077667236, "rewards/margins": 0.48999127745628357, "rewards/rejected": -1.58070969581604, "step": 299 }, { "epoch": 0.45351473922902497, "grad_norm": 13.767475128173828, "kl/avg_steps": 0.40625, "kl/beta": 0.039727624505758286, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.3426136618426043e-07, "logits/chosen": -1.284356951713562, "logits/rejected": -1.4688689708709717, "logps/chosen": -105.30261993408203, "logps/ref_chosen": -78.36398315429688, "logps/ref_rejected": -97.03912353515625, "logps/rejected": -137.79039001464844, "loss": 1.1837, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0687929391860962, "rewards/margins": 0.5387318730354309, "rewards/rejected": -1.6075247526168823, "step": 300 }, { "epoch": 0.45351473922902497, "eval_kl/n_epsilon_steps": 0.26892605423927307, "eval_kl/p_epsilon_steps": 0.7306337952613831, "eval_logits/chosen": -1.303465723991394, "eval_logits/rejected": -1.4157907962799072, "eval_logps/chosen": -110.7581558227539, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -137.05419921875, "eval_loss": 0.5443636178970337, "eval_rewards/accuracies": 0.7376760840415955, "eval_rewards/chosen": -0.9429383873939514, "eval_rewards/margins": 0.6431780457496643, "eval_rewards/rejected": -1.5861164331436157, "eval_runtime": 46.97, "eval_samples_per_second": 49.031, "eval_steps_per_second": 1.533, "step": 300 }, { "epoch": 0.455026455026455, "grad_norm": 10.284299850463867, "kl/avg_steps": 0.28125, "kl/beta": 0.03956688567996025, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.3301533956555885e-07, "logits/chosen": -1.5722053050994873, "logits/rejected": -1.5532550811767578, "logps/chosen": -96.18799591064453, "logps/ref_chosen": -70.6719741821289, "logps/ref_rejected": -87.11650848388672, "logps/rejected": -127.36128234863281, "loss": 1.1416, "rewards/accuracies": 0.625, "rewards/chosen": -1.0108939409255981, "rewards/margins": 0.5733522772789001, "rewards/rejected": -1.584246277809143, "step": 301 }, { "epoch": 0.4565381708238851, "grad_norm": 12.417190551757812, "kl/avg_steps": 0.3125, "kl/beta": 0.03945591300725937, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.317669908293554e-07, "logits/chosen": -1.6059229373931885, "logits/rejected": -1.5071606636047363, "logps/chosen": -115.22109985351562, "logps/ref_chosen": -85.29096221923828, "logps/ref_rejected": -106.22589874267578, "logps/rejected": -144.5008087158203, "loss": 1.2904, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1810417175292969, "rewards/margins": 0.32185858488082886, "rewards/rejected": -1.5029003620147705, "step": 302 }, { "epoch": 0.4580498866213152, "grad_norm": 11.21825122833252, "kl/avg_steps": 0.5, "kl/beta": 0.039333000779151917, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.3051635489464793e-07, "logits/chosen": -1.2936410903930664, "logits/rejected": -1.1640466451644897, "logps/chosen": -109.26667785644531, "logps/ref_chosen": -83.90058898925781, "logps/ref_rejected": -104.7340087890625, "logps/rejected": -149.0026397705078, "loss": 1.0433, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9966724514961243, "rewards/margins": 0.7328269481658936, "rewards/rejected": -1.729499340057373, "step": 303 }, { "epoch": 0.4595616024187453, "grad_norm": 10.65771770477295, "kl/avg_steps": 0.5625, "kl/beta": 0.03913731127977371, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.292634667444117e-07, "logits/chosen": -1.5216636657714844, "logits/rejected": -1.3718217611312866, "logps/chosen": -99.35061645507812, "logps/ref_chosen": -77.39997100830078, "logps/ref_rejected": -94.21647644042969, "logps/rejected": -134.24893188476562, "loss": 1.0042, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8575640320777893, "rewards/margins": 0.6984165906906128, "rewards/rejected": -1.5559806823730469, "step": 304 }, { "epoch": 0.46107331821617537, "grad_norm": 11.185667037963867, "kl/avg_steps": 0.40625, "kl/beta": 0.038918398320674896, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.280083614246217e-07, "logits/chosen": -1.4200937747955322, "logits/rejected": -1.178731918334961, "logps/chosen": -116.03550720214844, "logps/ref_chosen": -90.90805053710938, "logps/ref_rejected": -85.84992980957031, "logps/rejected": -128.09902954101562, "loss": 1.0985, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9784424304962158, "rewards/margins": 0.6563032269477844, "rewards/rejected": -1.6347455978393555, "step": 305 }, { "epoch": 0.46258503401360546, "grad_norm": 12.585004806518555, "kl/avg_steps": 0.53125, "kl/beta": 0.03876093029975891, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.267510740432719e-07, "logits/chosen": -1.1087026596069336, "logits/rejected": -1.497624397277832, "logps/chosen": -93.91549682617188, "logps/ref_chosen": -71.7261962890625, "logps/ref_rejected": -97.70491027832031, "logps/rejected": -135.10858154296875, "loss": 1.0697, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8565143346786499, "rewards/margins": 0.581344723701477, "rewards/rejected": -1.437859058380127, "step": 306 }, { "epoch": 0.46409674981103555, "grad_norm": 11.676605224609375, "kl/avg_steps": 0.21875, "kl/beta": 0.03855610266327858, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 3.2549163976939285e-07, "logits/chosen": -1.4268429279327393, "logits/rejected": -1.6580591201782227, "logps/chosen": -92.18916320800781, "logps/ref_chosen": -74.38668823242188, "logps/ref_rejected": -84.16001892089844, "logps/rejected": -113.77175903320312, "loss": 1.2034, "rewards/accuracies": 0.609375, "rewards/chosen": -0.6898114085197449, "rewards/margins": 0.4468283951282501, "rewards/rejected": -1.1366398334503174, "step": 307 }, { "epoch": 0.4656084656084656, "grad_norm": 10.842734336853027, "kl/avg_steps": 0.375, "kl/beta": 0.03847194463014603, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.2423009383206874e-07, "logits/chosen": -2.018758773803711, "logits/rejected": -1.362054705619812, "logps/chosen": -107.61516571044922, "logps/ref_chosen": -87.50894165039062, "logps/ref_rejected": -94.80848693847656, "logps/rejected": -130.3218231201172, "loss": 1.1014, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7732656002044678, "rewards/margins": 0.5833746194839478, "rewards/rejected": -1.356640338897705, "step": 308 }, { "epoch": 0.4671201814058957, "grad_norm": 11.334769248962402, "kl/avg_steps": 0.46875, "kl/beta": 0.03832821175456047, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.229664715194511e-07, "logits/chosen": -1.5512263774871826, "logits/rejected": -1.5080833435058594, "logps/chosen": -105.232421875, "logps/ref_chosen": -82.15191650390625, "logps/ref_rejected": -95.03496551513672, "logps/rejected": -134.20103454589844, "loss": 1.02, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8828413486480713, "rewards/margins": 0.6087982654571533, "rewards/rejected": -1.4916396141052246, "step": 309 }, { "epoch": 0.46863189720332576, "grad_norm": 18.294967651367188, "kl/avg_steps": 0.15625, "kl/beta": 0.0381493866443634, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.2170080817777257e-07, "logits/chosen": -1.8603748083114624, "logits/rejected": -1.6818149089813232, "logps/chosen": -117.48551940917969, "logps/ref_chosen": -93.7555160522461, "logps/ref_rejected": -96.93236541748047, "logps/rejected": -127.7294921875, "loss": 1.3625, "rewards/accuracies": 0.625, "rewards/chosen": -0.9083326458930969, "rewards/margins": 0.2615802586078644, "rewards/rejected": -1.1699128150939941, "step": 310 }, { "epoch": 0.47014361300075586, "grad_norm": 9.972360610961914, "kl/avg_steps": 0.4375, "kl/beta": 0.038089871406555176, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.204331392103574e-07, "logits/chosen": -1.3419418334960938, "logits/rejected": -1.67874276638031, "logps/chosen": -90.90428161621094, "logps/ref_chosen": -76.20762634277344, "logps/ref_rejected": -110.48141479492188, "logps/rejected": -139.69790649414062, "loss": 1.1054, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5603994131088257, "rewards/margins": 0.5446619987487793, "rewards/rejected": -1.105061411857605, "step": 311 }, { "epoch": 0.47165532879818595, "grad_norm": 9.703566551208496, "kl/avg_steps": 0.53125, "kl/beta": 0.03792395442724228, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.1916350007663176e-07, "logits/chosen": -1.3120102882385254, "logits/rejected": -1.1737221479415894, "logps/chosen": -89.21003723144531, "logps/ref_chosen": -69.08878326416016, "logps/ref_rejected": -91.84494018554688, "logps/rejected": -130.23162841796875, "loss": 1.0066, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7615611553192139, "rewards/margins": 0.6839256286621094, "rewards/rejected": -1.4454867839813232, "step": 312 }, { "epoch": 0.47316704459561604, "grad_norm": 11.096412658691406, "kl/avg_steps": 0.3125, "kl/beta": 0.03772354871034622, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.178919262911314e-07, "logits/chosen": -1.3597538471221924, "logits/rejected": -1.3801279067993164, "logps/chosen": -92.74771118164062, "logps/ref_chosen": -78.20826721191406, "logps/ref_rejected": -86.90351867675781, "logps/rejected": -113.30249786376953, "loss": 1.2374, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5501388311386108, "rewards/margins": 0.43886855244636536, "rewards/rejected": -0.9890073537826538, "step": 313 }, { "epoch": 0.47467876039304613, "grad_norm": 12.68870735168457, "kl/avg_steps": 0.34375, "kl/beta": 0.03760603070259094, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.166184534225087e-07, "logits/chosen": -1.4780869483947754, "logits/rejected": -1.4974141120910645, "logps/chosen": -106.15601348876953, "logps/ref_chosen": -90.41890716552734, "logps/ref_rejected": -84.33525848388672, "logps/rejected": -117.95146942138672, "loss": 1.0187, "rewards/accuracies": 0.671875, "rewards/chosen": -0.592218279838562, "rewards/margins": 0.6636958122253418, "rewards/rejected": -1.2559140920639038, "step": 314 }, { "epoch": 0.47619047619047616, "grad_norm": 10.189184188842773, "kl/avg_steps": 0.46875, "kl/beta": 0.03747720271348953, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.1534311709253723e-07, "logits/chosen": -1.4552478790283203, "logits/rejected": -1.5378687381744385, "logps/chosen": -100.55183410644531, "logps/ref_chosen": -87.32842254638672, "logps/ref_rejected": -93.71661376953125, "logps/rejected": -122.59095764160156, "loss": 1.0451, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4959626793861389, "rewards/margins": 0.5785229206085205, "rewards/rejected": -1.0744855403900146, "step": 315 }, { "epoch": 0.47770219198790626, "grad_norm": 11.919107437133789, "kl/avg_steps": 0.5, "kl/beta": 0.037302348762750626, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.1406595297511564e-07, "logits/chosen": -1.3762269020080566, "logits/rejected": -1.3930758237838745, "logps/chosen": -89.23201751708984, "logps/ref_chosen": -73.898681640625, "logps/ref_rejected": -115.42668151855469, "logps/rejected": -147.5204315185547, "loss": 1.0206, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5697667598724365, "rewards/margins": 0.617039680480957, "rewards/rejected": -1.1868064403533936, "step": 316 }, { "epoch": 0.47921390778533635, "grad_norm": 13.636117935180664, "kl/avg_steps": 0.5625, "kl/beta": 0.03711676225066185, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.1278699679526975e-07, "logits/chosen": -1.3755412101745605, "logits/rejected": -1.2973852157592773, "logps/chosen": -86.28968048095703, "logps/ref_chosen": -75.42947387695312, "logps/ref_rejected": -90.60166931152344, "logps/rejected": -119.53173828125, "loss": 1.0444, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4039730429649353, "rewards/margins": 0.6617342233657837, "rewards/rejected": -1.0657072067260742, "step": 317 }, { "epoch": 0.48072562358276644, "grad_norm": 9.761408805847168, "kl/avg_steps": 0.4375, "kl/beta": 0.03690914809703827, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.1150628432815336e-07, "logits/chosen": -1.304398775100708, "logits/rejected": -1.2814505100250244, "logps/chosen": -82.9610366821289, "logps/ref_chosen": -70.3831787109375, "logps/ref_rejected": -98.19901275634766, "logps/rejected": -124.93925476074219, "loss": 1.1756, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46455252170562744, "rewards/margins": 0.5136125683784485, "rewards/rejected": -0.9781651496887207, "step": 318 }, { "epoch": 0.48223733938019653, "grad_norm": 10.158917427062988, "kl/avg_steps": 0.4375, "kl/beta": 0.03674837574362755, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.1022385139804707e-07, "logits/chosen": -1.8672878742218018, "logits/rejected": -1.6835671663284302, "logps/chosen": -97.37528228759766, "logps/ref_chosen": -83.40225982666016, "logps/ref_rejected": -95.40069580078125, "logps/rejected": -126.68914794921875, "loss": 1.0576, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5123413801193237, "rewards/margins": 0.6273802518844604, "rewards/rejected": -1.1397216320037842, "step": 319 }, { "epoch": 0.4837490551776266, "grad_norm": 11.402392387390137, "kl/avg_steps": 0.15625, "kl/beta": 0.03658830001950264, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -1.3285362720489502, "logits/rejected": -1.4290411472320557, "logps/chosen": -80.62210083007812, "logps/ref_chosen": -68.70979309082031, "logps/ref_rejected": -87.00541687011719, "logps/rejected": -108.64424133300781, "loss": 1.2827, "rewards/accuracies": 0.671875, "rewards/chosen": -0.43682974576950073, "rewards/margins": 0.34812477231025696, "rewards/rejected": -0.7849545478820801, "step": 320 }, { "epoch": 0.4852607709750567, "grad_norm": 13.148420333862305, "kl/avg_steps": 0.53125, "kl/beta": 0.036531221121549606, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.0765396768561004e-07, "logits/chosen": -1.6233880519866943, "logits/rejected": -1.6341774463653564, "logps/chosen": -77.85541534423828, "logps/ref_chosen": -66.48135375976562, "logps/ref_rejected": -71.84545135498047, "logps/rejected": -98.65225982666016, "loss": 1.1251, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4173020124435425, "rewards/margins": 0.5548455119132996, "rewards/rejected": -0.9721474647521973, "step": 321 }, { "epoch": 0.48677248677248675, "grad_norm": 11.733065605163574, "kl/avg_steps": 0.640625, "kl/beta": 0.03633817285299301, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.063665887884511e-07, "logits/chosen": -1.226560354232788, "logits/rejected": -1.4153425693511963, "logps/chosen": -79.17440795898438, "logps/ref_chosen": -65.94654846191406, "logps/ref_rejected": -94.26603698730469, "logps/rejected": -128.4293670654297, "loss": 0.9387, "rewards/accuracies": 0.828125, "rewards/chosen": -0.47956448793411255, "rewards/margins": 0.7515709400177002, "rewards/rejected": -1.231135368347168, "step": 322 }, { "epoch": 0.48828420256991684, "grad_norm": 14.1713228225708, "kl/avg_steps": 0.375, "kl/beta": 0.03610686585307121, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.0507763319663517e-07, "logits/chosen": -1.614477276802063, "logits/rejected": -1.2867920398712158, "logps/chosen": -102.02337646484375, "logps/ref_chosen": -86.5498046875, "logps/ref_rejected": -110.39498901367188, "logps/rejected": -141.76101684570312, "loss": 1.1979, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5602794885635376, "rewards/margins": 0.5638449192047119, "rewards/rejected": -1.1241244077682495, "step": 323 }, { "epoch": 0.4897959183673469, "grad_norm": 13.024452209472656, "kl/avg_steps": 0.453125, "kl/beta": 0.03597196936607361, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.0378713696502097e-07, "logits/chosen": -1.6353603601455688, "logits/rejected": -1.6581999063491821, "logps/chosen": -85.73527526855469, "logps/ref_chosen": -74.44218444824219, "logps/ref_rejected": -85.76464080810547, "logps/rejected": -114.40110778808594, "loss": 1.0649, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4083707630634308, "rewards/margins": 0.6154605746269226, "rewards/rejected": -1.0238313674926758, "step": 324 }, { "epoch": 0.491307634164777, "grad_norm": 11.41663932800293, "kl/avg_steps": 0.46875, "kl/beta": 0.0358097068965435, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.0249513619156206e-07, "logits/chosen": -1.4231340885162354, "logits/rejected": -1.398125410079956, "logps/chosen": -100.60375213623047, "logps/ref_chosen": -81.43812561035156, "logps/ref_rejected": -97.04302978515625, "logps/rejected": -132.8642578125, "loss": 1.0998, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6865410804748535, "rewards/margins": 0.5874417424201965, "rewards/rejected": -1.2739828824996948, "step": 325 }, { "epoch": 0.4928193499622071, "grad_norm": 10.984821319580078, "kl/avg_steps": 0.28125, "kl/beta": 0.035642631351947784, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.012016670162977e-07, "logits/chosen": -1.5135228633880615, "logits/rejected": -1.453247308731079, "logps/chosen": -113.7421875, "logps/ref_chosen": -91.65318298339844, "logps/ref_rejected": -90.64222717285156, "logps/rejected": -124.34673309326172, "loss": 1.2212, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7893346548080444, "rewards/margins": 0.40609291195869446, "rewards/rejected": -1.195427656173706, "step": 326 }, { "epoch": 0.4943310657596372, "grad_norm": 13.8237886428833, "kl/avg_steps": 0.28125, "kl/beta": 0.035542670637369156, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.99906765620341e-07, "logits/chosen": -1.5450892448425293, "logits/rejected": -1.5019280910491943, "logps/chosen": -112.25006103515625, "logps/ref_chosen": -89.97216796875, "logps/ref_rejected": -97.54869079589844, "logps/rejected": -133.211181640625, "loss": 1.2382, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7921844720840454, "rewards/margins": 0.4665413796901703, "rewards/rejected": -1.258725881576538, "step": 327 }, { "epoch": 0.4958427815570673, "grad_norm": 9.504220008850098, "kl/avg_steps": 0.4375, "kl/beta": 0.035442985594272614, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.9861046822486766e-07, "logits/chosen": -1.2479305267333984, "logits/rejected": -1.1524548530578613, "logps/chosen": -99.87271118164062, "logps/ref_chosen": -80.27335357666016, "logps/ref_rejected": -99.04093933105469, "logps/rejected": -132.95858764648438, "loss": 1.135, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6942028999328613, "rewards/margins": 0.4996661841869354, "rewards/rejected": -1.1938691139221191, "step": 328 }, { "epoch": 0.4973544973544973, "grad_norm": 11.80764389038086, "kl/avg_steps": 0.40625, "kl/beta": 0.035288598388433456, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.9731281109010253e-07, "logits/chosen": -1.4124984741210938, "logits/rejected": -1.446164846420288, "logps/chosen": -100.15596008300781, "logps/ref_chosen": -79.75892639160156, "logps/ref_rejected": -102.06265258789062, "logps/rejected": -139.7376708984375, "loss": 1.0659, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7199461460113525, "rewards/margins": 0.601253867149353, "rewards/rejected": -1.3212000131607056, "step": 329 }, { "epoch": 0.4988662131519274, "grad_norm": 9.457327842712402, "kl/avg_steps": 0.53125, "kl/beta": 0.03514581918716431, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.9601383051430505e-07, "logits/chosen": -1.3547937870025635, "logits/rejected": -1.2515625953674316, "logps/chosen": -85.88477325439453, "logps/ref_chosen": -70.55734252929688, "logps/ref_rejected": -94.53077697753906, "logps/rejected": -131.9217529296875, "loss": 0.9911, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5381969213485718, "rewards/margins": 0.7649656534194946, "rewards/rejected": -1.3031624555587769, "step": 330 }, { "epoch": 0.5003779289493575, "grad_norm": 9.513692855834961, "kl/avg_steps": 0.5, "kl/beta": 0.03496009111404419, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.947135628327544e-07, "logits/chosen": -1.3662301301956177, "logits/rejected": -1.1261534690856934, "logps/chosen": -97.10047149658203, "logps/ref_chosen": -75.46063995361328, "logps/ref_rejected": -84.78495788574219, "logps/rejected": -129.4873046875, "loss": 0.9935, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7566953897476196, "rewards/margins": 0.7958719730377197, "rewards/rejected": -1.5525672435760498, "step": 331 }, { "epoch": 0.5018896447467877, "grad_norm": 11.208243370056152, "kl/avg_steps": 0.5625, "kl/beta": 0.0347861610352993, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.934120444167326e-07, "logits/chosen": -1.6225413084030151, "logits/rejected": -1.5228370428085327, "logps/chosen": -105.17870330810547, "logps/ref_chosen": -84.32807922363281, "logps/ref_rejected": -95.63302612304688, "logps/rejected": -136.43002319335938, "loss": 1.0095, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7220751643180847, "rewards/margins": 0.6849751472473145, "rewards/rejected": -1.4070502519607544, "step": 332 }, { "epoch": 0.5034013605442177, "grad_norm": 10.370922088623047, "kl/avg_steps": 0.53125, "kl/beta": 0.03459158539772034, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.921093116725076e-07, "logits/chosen": -1.5618640184402466, "logits/rejected": -1.3652313947677612, "logps/chosen": -102.90045166015625, "logps/ref_chosen": -78.21322631835938, "logps/ref_rejected": -103.82716369628906, "logps/rejected": -152.17138671875, "loss": 0.9057, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8525649905204773, "rewards/margins": 0.8088828325271606, "rewards/rejected": -1.6614477634429932, "step": 333 }, { "epoch": 0.5049130763416477, "grad_norm": 11.503582000732422, "kl/avg_steps": 0.375, "kl/beta": 0.03440878912806511, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.9080540104031484e-07, "logits/chosen": -1.3274097442626953, "logits/rejected": -1.448803424835205, "logps/chosen": -109.16493225097656, "logps/ref_chosen": -85.0171127319336, "logps/ref_rejected": -106.79039764404297, "logps/rejected": -147.31170654296875, "loss": 1.2069, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8311678171157837, "rewards/margins": 0.5532088279724121, "rewards/rejected": -1.3843765258789062, "step": 334 }, { "epoch": 0.5064247921390779, "grad_norm": 19.717958450317383, "kl/avg_steps": 0.3125, "kl/beta": 0.034280236810445786, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.895003489933375e-07, "logits/chosen": -1.1505557298660278, "logits/rejected": -1.2536959648132324, "logps/chosen": -106.75822448730469, "logps/ref_chosen": -78.56512451171875, "logps/ref_rejected": -92.68515014648438, "logps/rejected": -138.57762145996094, "loss": 1.2316, "rewards/accuracies": 0.640625, "rewards/chosen": -0.9674444198608398, "rewards/margins": 0.5958920121192932, "rewards/rejected": -1.5633363723754883, "step": 335 }, { "epoch": 0.5079365079365079, "grad_norm": 14.143689155578613, "kl/avg_steps": 0.5, "kl/beta": 0.03417344391345978, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.8819419203668675e-07, "logits/chosen": -1.2581086158752441, "logits/rejected": -1.164086103439331, "logps/chosen": -122.58998107910156, "logps/ref_chosen": -88.63243103027344, "logps/ref_rejected": -107.89385223388672, "logps/rejected": -161.6552734375, "loss": 1.0498, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1570062637329102, "rewards/margins": 0.6672290563583374, "rewards/rejected": -1.824235439300537, "step": 336 }, { "epoch": 0.509448223733938, "grad_norm": 11.089780807495117, "kl/avg_steps": 0.34375, "kl/beta": 0.03400342911481857, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.8688696670638053e-07, "logits/chosen": -1.247017502784729, "logits/rejected": -1.4651862382888794, "logps/chosen": -127.10426330566406, "logps/ref_chosen": -93.25018310546875, "logps/ref_rejected": -103.8592529296875, "logps/rejected": -151.89874267578125, "loss": 1.203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1513574123382568, "rewards/margins": 0.47332531213760376, "rewards/rejected": -1.624682903289795, "step": 337 }, { "epoch": 0.5109599395313681, "grad_norm": 12.785868644714355, "kl/avg_steps": 0.34375, "kl/beta": 0.03388693928718567, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -0.8946573138237, "logits/rejected": -0.9418930411338806, "logps/chosen": -117.18191528320312, "logps/ref_chosen": -81.79462432861328, "logps/ref_rejected": -90.98943328857422, "logps/rejected": -141.94183349609375, "loss": 1.1434, "rewards/accuracies": 0.703125, "rewards/chosen": -1.197884202003479, "rewards/margins": 0.5193731784820557, "rewards/rejected": -1.7172573804855347, "step": 338 }, { "epoch": 0.5124716553287982, "grad_norm": 10.959848403930664, "kl/avg_steps": 0.59375, "kl/beta": 0.033770851790905, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.842694572172736e-07, "logits/chosen": -1.2875313758850098, "logits/rejected": -1.4024841785430908, "logps/chosen": -85.72515869140625, "logps/ref_chosen": -61.803558349609375, "logps/ref_rejected": -85.16979217529297, "logps/rejected": -131.8690948486328, "loss": 0.9512, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8063684701919556, "rewards/margins": 0.7594180107116699, "rewards/rejected": -1.565786361694336, "step": 339 }, { "epoch": 0.5139833711262283, "grad_norm": 9.914621353149414, "kl/avg_steps": 0.40625, "kl/beta": 0.0335715226829052, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.8295924627584004e-07, "logits/chosen": -1.2250884771347046, "logits/rejected": -0.937452495098114, "logps/chosen": -104.68017578125, "logps/ref_chosen": -72.48607635498047, "logps/ref_rejected": -79.86129760742188, "logps/rejected": -131.80706787109375, "loss": 1.1311, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0805590152740479, "rewards/margins": 0.6526724696159363, "rewards/rejected": -1.7332314252853394, "step": 340 }, { "epoch": 0.5154950869236583, "grad_norm": 11.14721393585205, "kl/avg_steps": 0.375, "kl/beta": 0.03343569114804268, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.816481133934373e-07, "logits/chosen": -1.2342188358306885, "logits/rejected": -1.3302650451660156, "logps/chosen": -108.94258117675781, "logps/ref_chosen": -77.36830139160156, "logps/ref_rejected": -94.64933013916016, "logps/rejected": -147.7334442138672, "loss": 1.0595, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0553505420684814, "rewards/margins": 0.7088862657546997, "rewards/rejected": -1.7642368078231812, "step": 341 }, { "epoch": 0.5170068027210885, "grad_norm": 9.97767162322998, "kl/avg_steps": 0.4375, "kl/beta": 0.033310774713754654, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.8033609524527046e-07, "logits/chosen": -1.2789771556854248, "logits/rejected": -1.195197582244873, "logps/chosen": -99.66441345214844, "logps/ref_chosen": -71.0083236694336, "logps/ref_rejected": -84.22953796386719, "logps/rejected": -134.83029174804688, "loss": 1.0397, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9530007243156433, "rewards/margins": 0.7210956811904907, "rewards/rejected": -1.6740963459014893, "step": 342 }, { "epoch": 0.5185185185185185, "grad_norm": 12.255571365356445, "kl/avg_steps": 0.3125, "kl/beta": 0.03316567465662956, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.7902322853130753e-07, "logits/chosen": -1.649799108505249, "logits/rejected": -1.6315422058105469, "logps/chosen": -120.76673889160156, "logps/ref_chosen": -91.44624328613281, "logps/ref_rejected": -99.0604476928711, "logps/rejected": -140.72201538085938, "loss": 1.2552, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9723671078681946, "rewards/margins": 0.4006619453430176, "rewards/rejected": -1.3730289936065674, "step": 343 }, { "epoch": 0.5200302343159486, "grad_norm": 10.119938850402832, "kl/avg_steps": 0.5625, "kl/beta": 0.03306235373020172, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.7770954997525274e-07, "logits/chosen": -1.1954832077026367, "logits/rejected": -1.3106439113616943, "logps/chosen": -108.83035278320312, "logps/ref_chosen": -73.43608093261719, "logps/ref_rejected": -100.76569366455078, "logps/rejected": -157.56735229492188, "loss": 1.0125, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1673158407211304, "rewards/margins": 0.6984189748764038, "rewards/rejected": -1.8657348155975342, "step": 344 }, { "epoch": 0.5215419501133787, "grad_norm": 12.316679954528809, "kl/avg_steps": 0.375, "kl/beta": 0.032877419143915176, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.7639509632351927e-07, "logits/chosen": -1.2922027111053467, "logits/rejected": -1.3272606134414673, "logps/chosen": -100.18463134765625, "logps/ref_chosen": -75.79296875, "logps/ref_rejected": -94.34156799316406, "logps/rejected": -137.67491149902344, "loss": 1.1094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8025590181350708, "rewards/margins": 0.6133404970169067, "rewards/rejected": -1.4158995151519775, "step": 345 }, { "epoch": 0.5230536659108088, "grad_norm": 12.418992042541504, "kl/avg_steps": 0.375, "kl/beta": 0.03275458887219429, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.7507990434420123e-07, "logits/chosen": -1.4952889680862427, "logits/rejected": -1.435889482498169, "logps/chosen": -98.2471694946289, "logps/ref_chosen": -72.26289367675781, "logps/ref_rejected": -106.36925506591797, "logps/rejected": -154.2978515625, "loss": 1.1168, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8507057428359985, "rewards/margins": 0.7072672843933105, "rewards/rejected": -1.5579731464385986, "step": 346 }, { "epoch": 0.5245653817082389, "grad_norm": 12.831889152526855, "kl/avg_steps": 0.25, "kl/beta": 0.03263222053647041, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.737640108260456e-07, "logits/chosen": -1.141088604927063, "logits/rejected": -1.2003611326217651, "logps/chosen": -102.23765563964844, "logps/ref_chosen": -71.19871520996094, "logps/ref_rejected": -91.543212890625, "logps/rejected": -139.07275390625, "loss": 1.1903, "rewards/accuracies": 0.625, "rewards/chosen": -1.0140210390090942, "rewards/margins": 0.5289695858955383, "rewards/rejected": -1.5429906845092773, "step": 347 }, { "epoch": 0.5260770975056689, "grad_norm": 9.99068546295166, "kl/avg_steps": 0.28125, "kl/beta": 0.03255084156990051, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.724474525774229e-07, "logits/chosen": -1.159302830696106, "logits/rejected": -1.177544355392456, "logps/chosen": -96.44154357910156, "logps/ref_chosen": -69.95603942871094, "logps/ref_rejected": -83.64309692382812, "logps/rejected": -132.4287109375, "loss": 1.0353, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8636122941970825, "rewards/margins": 0.7161925435066223, "rewards/rejected": -1.57980477809906, "step": 348 }, { "epoch": 0.527588813303099, "grad_norm": 10.788860321044922, "kl/avg_steps": 0.453125, "kl/beta": 0.032459549605846405, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.711302664252973e-07, "logits/chosen": -1.4271414279937744, "logits/rejected": -1.271451473236084, "logps/chosen": -96.35053253173828, "logps/ref_chosen": -70.71857452392578, "logps/ref_rejected": -99.93263244628906, "logps/rejected": -146.58157348632812, "loss": 1.07, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8293413519859314, "rewards/margins": 0.6725594401359558, "rewards/rejected": -1.5019009113311768, "step": 349 }, { "epoch": 0.5291005291005291, "grad_norm": 10.467818260192871, "kl/avg_steps": 0.40625, "kl/beta": 0.03231313079595566, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.698124892141971e-07, "logits/chosen": -1.440155267715454, "logits/rejected": -1.2903451919555664, "logps/chosen": -106.1199722290039, "logps/ref_chosen": -78.16873168945312, "logps/ref_rejected": -104.84309387207031, "logps/rejected": -159.8768310546875, "loss": 0.904, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9024109840393066, "rewards/margins": 0.8648760318756104, "rewards/rejected": -1.767287015914917, "step": 350 }, { "epoch": 0.5306122448979592, "grad_norm": 9.843159675598145, "kl/avg_steps": 0.59375, "kl/beta": 0.03218238800764084, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.6849415780518357e-07, "logits/chosen": -1.2251023054122925, "logits/rejected": -1.3334765434265137, "logps/chosen": -101.87110900878906, "logps/ref_chosen": -71.79151916503906, "logps/ref_rejected": -97.04634094238281, "logps/rejected": -147.25169372558594, "loss": 1.1072, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9649567604064941, "rewards/margins": 0.6374739408493042, "rewards/rejected": -1.602430820465088, "step": 351 }, { "epoch": 0.5321239606953893, "grad_norm": 11.369997024536133, "kl/avg_steps": 0.34375, "kl/beta": 0.031992435455322266, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.6717530907482024e-07, "logits/chosen": -1.220479130744934, "logits/rejected": -1.3581123352050781, "logps/chosen": -112.46723937988281, "logps/ref_chosen": -80.86544799804688, "logps/ref_rejected": -102.02128601074219, "logps/rejected": -151.5338592529297, "loss": 1.1227, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0110710859298706, "rewards/margins": 0.5643045902252197, "rewards/rejected": -1.5753756761550903, "step": 352 }, { "epoch": 0.5336356764928194, "grad_norm": 10.501523971557617, "kl/avg_steps": 0.4375, "kl/beta": 0.031882837414741516, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.658559799141411e-07, "logits/chosen": -1.4000976085662842, "logits/rejected": -1.3244915008544922, "logps/chosen": -108.99580383300781, "logps/ref_chosen": -84.77235412597656, "logps/ref_rejected": -86.77130126953125, "logps/rejected": -132.26800537109375, "loss": 1.0936, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7711913585662842, "rewards/margins": 0.6679372787475586, "rewards/rejected": -1.4391286373138428, "step": 353 }, { "epoch": 0.5351473922902494, "grad_norm": 9.845510482788086, "kl/avg_steps": 0.5, "kl/beta": 0.031743958592414856, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.6453620722761895e-07, "logits/chosen": -1.3057725429534912, "logits/rejected": -1.5508718490600586, "logps/chosen": -80.18466186523438, "logps/ref_chosen": -54.33562088012695, "logps/ref_rejected": -92.4120101928711, "logps/rejected": -140.17367553710938, "loss": 1.0502, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8189990520477295, "rewards/margins": 0.6852681636810303, "rewards/rejected": -1.5042672157287598, "step": 354 }, { "epoch": 0.5366591080876795, "grad_norm": 9.737431526184082, "kl/avg_steps": 0.53125, "kl/beta": 0.03158602863550186, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.632160279321328e-07, "logits/chosen": -1.1256142854690552, "logits/rejected": -1.3838088512420654, "logps/chosen": -90.2604751586914, "logps/ref_chosen": -61.8388671875, "logps/ref_rejected": -98.65571594238281, "logps/rejected": -151.04173278808594, "loss": 1.0026, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8956200480461121, "rewards/margins": 0.746633768081665, "rewards/rejected": -1.6422538757324219, "step": 355 }, { "epoch": 0.5381708238851096, "grad_norm": 11.063801765441895, "kl/avg_steps": 0.5, "kl/beta": 0.03141911327838898, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.618954789559356e-07, "logits/chosen": -1.2441139221191406, "logits/rejected": -1.296750783920288, "logps/chosen": -86.55245971679688, "logps/ref_chosen": -63.92546463012695, "logps/ref_rejected": -89.682861328125, "logps/rejected": -133.89390563964844, "loss": 1.1363, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7118154764175415, "rewards/margins": 0.6670767664909363, "rewards/rejected": -1.378892183303833, "step": 356 }, { "epoch": 0.5396825396825397, "grad_norm": 9.868560791015625, "kl/avg_steps": 0.375, "kl/beta": 0.031262800097465515, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.6057459723762076e-07, "logits/chosen": -1.2802687883377075, "logits/rejected": -1.6558847427368164, "logps/chosen": -110.63601684570312, "logps/ref_chosen": -81.07589721679688, "logps/ref_rejected": -85.06967163085938, "logps/rejected": -132.71707153320312, "loss": 1.1316, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9249534010887146, "rewards/margins": 0.5564345121383667, "rewards/rejected": -1.4813878536224365, "step": 357 }, { "epoch": 0.5411942554799698, "grad_norm": 9.289213180541992, "kl/avg_steps": 0.375, "kl/beta": 0.03114600107073784, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.5925341972508954e-07, "logits/chosen": -1.5175108909606934, "logits/rejected": -1.3619422912597656, "logps/chosen": -109.77130126953125, "logps/ref_chosen": -84.09109497070312, "logps/ref_rejected": -85.07244110107422, "logps/rejected": -135.58029174804688, "loss": 0.9844, "rewards/accuracies": 0.71875, "rewards/chosen": -0.799083948135376, "rewards/margins": 0.7630775570869446, "rewards/rejected": -1.5621614456176758, "step": 358 }, { "epoch": 0.5427059712773998, "grad_norm": 12.027239799499512, "kl/avg_steps": 0.28125, "kl/beta": 0.03102963976562023, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.579319833745169e-07, "logits/chosen": -1.8530116081237793, "logits/rejected": -1.6771414279937744, "logps/chosen": -108.74325561523438, "logps/ref_chosen": -80.7490234375, "logps/ref_rejected": -94.92912292480469, "logps/rejected": -136.96435546875, "loss": 1.2222, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8691080808639526, "rewards/margins": 0.42722088098526, "rewards/rejected": -1.2963290214538574, "step": 359 }, { "epoch": 0.54421768707483, "grad_norm": 8.521217346191406, "kl/avg_steps": 0.34375, "kl/beta": 0.030942615121603012, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.5661032514931834e-07, "logits/chosen": -1.2853665351867676, "logits/rejected": -1.4590386152267456, "logps/chosen": -107.85910034179688, "logps/ref_chosen": -78.38680267333984, "logps/ref_rejected": -109.6893310546875, "logps/rejected": -161.26239013671875, "loss": 1.038, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9122262001037598, "rewards/margins": 0.6750233173370361, "rewards/rejected": -1.587249517440796, "step": 360 }, { "epoch": 0.54572940287226, "grad_norm": 9.228687286376953, "kl/avg_steps": 0.5, "kl/beta": 0.030836613848805428, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.552884820191154e-07, "logits/chosen": -1.172134280204773, "logits/rejected": -1.341686725616455, "logps/chosen": -102.47615814208984, "logps/ref_chosen": -73.9055404663086, "logps/ref_rejected": -89.8489990234375, "logps/rejected": -143.1851806640625, "loss": 0.9892, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8791087865829468, "rewards/margins": 0.7538092136383057, "rewards/rejected": -1.632917881011963, "step": 361 }, { "epoch": 0.54724111866969, "grad_norm": 10.797682762145996, "kl/avg_steps": 0.453125, "kl/beta": 0.03068319708108902, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.53966490958702e-07, "logits/chosen": -1.5308431386947632, "logits/rejected": -1.8336783647537231, "logps/chosen": -116.3746337890625, "logps/ref_chosen": -82.32565307617188, "logps/ref_rejected": -123.14100646972656, "logps/rejected": -175.5985565185547, "loss": 1.1361, "rewards/accuracies": 0.734375, "rewards/chosen": -1.0415807962417603, "rewards/margins": 0.5558252334594727, "rewards/rejected": -1.5974061489105225, "step": 362 }, { "epoch": 0.5487528344671202, "grad_norm": 10.823992729187012, "kl/avg_steps": 0.78125, "kl/beta": 0.030544791370630264, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.526443889470099e-07, "logits/chosen": -1.1187574863433838, "logits/rejected": -1.4465909004211426, "logps/chosen": -98.7936782836914, "logps/ref_chosen": -66.05493927001953, "logps/ref_rejected": -106.79598999023438, "logps/rejected": -164.49728393554688, "loss": 0.9382, "rewards/accuracies": 0.875, "rewards/chosen": -0.9929654598236084, "rewards/margins": 0.7533215284347534, "rewards/rejected": -1.7462869882583618, "step": 363 }, { "epoch": 0.5502645502645502, "grad_norm": 8.624401092529297, "kl/avg_steps": 0.46875, "kl/beta": 0.030308010056614876, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.513222129660744e-07, "logits/chosen": -1.4273147583007812, "logits/rejected": -1.2582423686981201, "logps/chosen": -104.58074951171875, "logps/ref_chosen": -76.38365173339844, "logps/ref_rejected": -100.22221374511719, "logps/rejected": -153.84181213378906, "loss": 1.0464, "rewards/accuracies": 0.75, "rewards/chosen": -0.8530045747756958, "rewards/margins": 0.7596189975738525, "rewards/rejected": -1.6126235723495483, "step": 364 }, { "epoch": 0.5517762660619804, "grad_norm": 10.569746017456055, "kl/avg_steps": 0.5625, "kl/beta": 0.03016660362482071, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5e-07, "logits/chosen": -1.4147711992263794, "logits/rejected": -1.129202961921692, "logps/chosen": -104.73855590820312, "logps/ref_chosen": -81.83399963378906, "logps/ref_rejected": -89.06932830810547, "logps/rejected": -133.12460327148438, "loss": 0.995, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6893056631088257, "rewards/margins": 0.6299935579299927, "rewards/rejected": -1.3192992210388184, "step": 365 }, { "epoch": 0.5532879818594104, "grad_norm": 10.754393577575684, "kl/avg_steps": 0.34375, "kl/beta": 0.029997866600751877, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.486777870339255e-07, "logits/chosen": -1.580064296722412, "logits/rejected": -1.4120087623596191, "logps/chosen": -98.59486389160156, "logps/ref_chosen": -72.03398895263672, "logps/ref_rejected": -83.65354919433594, "logps/rejected": -129.49868774414062, "loss": 1.1278, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7969068288803101, "rewards/margins": 0.5694468021392822, "rewards/rejected": -1.3663535118103027, "step": 366 }, { "epoch": 0.5547996976568406, "grad_norm": 9.318233489990234, "kl/avg_steps": 0.40625, "kl/beta": 0.029895102605223656, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.4735561105299014e-07, "logits/chosen": -1.5403746366500854, "logits/rejected": -1.6996790170669556, "logps/chosen": -104.44827270507812, "logps/ref_chosen": -72.39827728271484, "logps/ref_rejected": -95.58364868164062, "logps/rejected": -145.46209716796875, "loss": 1.1044, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9578984975814819, "rewards/margins": 0.5246537327766418, "rewards/rejected": -1.4825522899627686, "step": 367 }, { "epoch": 0.5563114134542706, "grad_norm": 10.34378433227539, "kl/avg_steps": 0.34375, "kl/beta": 0.029774144291877747, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.46033509041298e-07, "logits/chosen": -1.5399154424667358, "logits/rejected": -1.1814687252044678, "logps/chosen": -126.15576171875, "logps/ref_chosen": -90.12812042236328, "logps/ref_rejected": -91.6636962890625, "logps/rejected": -148.67593383789062, "loss": 1.0811, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0728702545166016, "rewards/margins": 0.6154052019119263, "rewards/rejected": -1.6882755756378174, "step": 368 }, { "epoch": 0.5578231292517006, "grad_norm": 17.82675552368164, "kl/avg_steps": 0.5, "kl/beta": 0.02967214584350586, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.447115179808846e-07, "logits/chosen": -1.8711051940917969, "logits/rejected": -1.640881061553955, "logps/chosen": -102.46626281738281, "logps/ref_chosen": -71.29417419433594, "logps/ref_rejected": -99.03875732421875, "logps/rejected": -147.19813537597656, "loss": 1.1811, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9218083620071411, "rewards/margins": 0.49515968561172485, "rewards/rejected": -1.4169681072235107, "step": 369 }, { "epoch": 0.5593348450491308, "grad_norm": 9.260988235473633, "kl/avg_steps": 0.46875, "kl/beta": 0.02952452376484871, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.4338967485068164e-07, "logits/chosen": -1.1396031379699707, "logits/rejected": -1.0747642517089844, "logps/chosen": -98.35624694824219, "logps/ref_chosen": -69.14627075195312, "logps/ref_rejected": -93.58651733398438, "logps/rejected": -149.8125457763672, "loss": 1.0041, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8626432418823242, "rewards/margins": 0.786967933177948, "rewards/rejected": -1.649611234664917, "step": 370 }, { "epoch": 0.5608465608465608, "grad_norm": 10.98085880279541, "kl/avg_steps": 0.5, "kl/beta": 0.029386773705482483, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.420680166254831e-07, "logits/chosen": -1.3450419902801514, "logits/rejected": -1.1297814846038818, "logps/chosen": -95.94750213623047, "logps/ref_chosen": -65.76728820800781, "logps/ref_rejected": -79.9320068359375, "logps/rejected": -134.09359741210938, "loss": 1.0643, "rewards/accuracies": 0.75, "rewards/chosen": -0.8851276636123657, "rewards/margins": 0.6940947771072388, "rewards/rejected": -1.5792224407196045, "step": 371 }, { "epoch": 0.562358276643991, "grad_norm": 11.20559310913086, "kl/avg_steps": 0.3125, "kl/beta": 0.029240570962429047, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.4074658027491044e-07, "logits/chosen": -1.160583734512329, "logits/rejected": -1.33573579788208, "logps/chosen": -96.75958251953125, "logps/ref_chosen": -69.97252655029297, "logps/ref_rejected": -92.38317108154297, "logps/rejected": -138.94244384765625, "loss": 1.2057, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7855249643325806, "rewards/margins": 0.5673984885215759, "rewards/rejected": -1.3529233932495117, "step": 372 }, { "epoch": 0.563869992441421, "grad_norm": 12.406031608581543, "kl/avg_steps": 0.40625, "kl/beta": 0.029149478301405907, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.394254027623792e-07, "logits/chosen": -1.3365615606307983, "logits/rejected": -1.7029169797897339, "logps/chosen": -114.59992980957031, "logps/ref_chosen": -79.34700012207031, "logps/ref_rejected": -95.69737243652344, "logps/rejected": -154.38079833984375, "loss": 1.0857, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0270100831985474, "rewards/margins": 0.6728541851043701, "rewards/rejected": -1.699864387512207, "step": 373 }, { "epoch": 0.5653817082388511, "grad_norm": 11.336803436279297, "kl/avg_steps": 0.53125, "kl/beta": 0.02903153747320175, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.381045210440644e-07, "logits/chosen": -1.3897533416748047, "logits/rejected": -1.2900179624557495, "logps/chosen": -124.2972412109375, "logps/ref_chosen": -93.45108032226562, "logps/ref_rejected": -93.575927734375, "logps/rejected": -156.41477966308594, "loss": 0.8913, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8952039480209351, "rewards/margins": 0.917777419090271, "rewards/rejected": -1.812981367111206, "step": 374 }, { "epoch": 0.5668934240362812, "grad_norm": 9.698143005371094, "kl/avg_steps": 0.53125, "kl/beta": 0.028878122568130493, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3678397206786715e-07, "logits/chosen": -1.5062309503555298, "logits/rejected": -1.3306735754013062, "logps/chosen": -100.72938537597656, "logps/ref_chosen": -77.37176513671875, "logps/ref_rejected": -98.59054565429688, "logps/rejected": -143.8408203125, "loss": 1.0636, "rewards/accuracies": 0.75, "rewards/chosen": -0.6728950142860413, "rewards/margins": 0.6229599118232727, "rewards/rejected": -1.295854926109314, "step": 375 }, { "epoch": 0.5684051398337112, "grad_norm": 9.342375755310059, "kl/avg_steps": 0.5, "kl/beta": 0.02872551791369915, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.3546379277238103e-07, "logits/chosen": -1.0259218215942383, "logits/rejected": -1.408305048942566, "logps/chosen": -99.64701080322266, "logps/ref_chosen": -68.99790954589844, "logps/ref_rejected": -90.37117004394531, "logps/rejected": -149.88748168945312, "loss": 1.0369, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8794429302215576, "rewards/margins": 0.8176605105400085, "rewards/rejected": -1.697103500366211, "step": 376 }, { "epoch": 0.5699168556311414, "grad_norm": 9.864453315734863, "kl/avg_steps": 0.40625, "kl/beta": 0.028582604601979256, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.3414402008585886e-07, "logits/chosen": -1.0788323879241943, "logits/rejected": -0.9475747346878052, "logps/chosen": -95.7227554321289, "logps/ref_chosen": -64.22705841064453, "logps/ref_rejected": -73.10292053222656, "logps/rejected": -123.53993225097656, "loss": 1.105, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8982629179954529, "rewards/margins": 0.532755970954895, "rewards/rejected": -1.4310189485549927, "step": 377 }, { "epoch": 0.5714285714285714, "grad_norm": 10.464985847473145, "kl/avg_steps": 0.5625, "kl/beta": 0.028466958552598953, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.3282469092517977e-07, "logits/chosen": -1.3532541990280151, "logits/rejected": -1.3627448081970215, "logps/chosen": -108.75630187988281, "logps/ref_chosen": -76.90864562988281, "logps/ref_rejected": -90.53459930419922, "logps/rejected": -139.50833129882812, "loss": 1.1656, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9033360481262207, "rewards/margins": 0.479516863822937, "rewards/rejected": -1.3828529119491577, "step": 378 }, { "epoch": 0.5729402872260015, "grad_norm": 15.603812217712402, "kl/avg_steps": 0.59375, "kl/beta": 0.028307726606726646, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3150584219481643e-07, "logits/chosen": -1.583913803100586, "logits/rejected": -1.7535749673843384, "logps/chosen": -120.983154296875, "logps/ref_chosen": -91.2371597290039, "logps/ref_rejected": -120.1969985961914, "logps/rejected": -173.04830932617188, "loss": 1.0491, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8407952785491943, "rewards/margins": 0.6449382305145264, "rewards/rejected": -1.4857335090637207, "step": 379 }, { "epoch": 0.5744520030234316, "grad_norm": 8.755367279052734, "kl/avg_steps": 0.59375, "kl/beta": 0.028140641748905182, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3018751078580283e-07, "logits/chosen": -1.3944610357284546, "logits/rejected": -1.2365813255310059, "logps/chosen": -99.56253051757812, "logps/ref_chosen": -77.78315734863281, "logps/ref_rejected": -92.56083679199219, "logps/rejected": -143.6357421875, "loss": 0.9618, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6120294332504272, "rewards/margins": 0.813266932964325, "rewards/rejected": -1.4252963066101074, "step": 380 }, { "epoch": 0.5759637188208617, "grad_norm": 12.221405982971191, "kl/avg_steps": 0.3125, "kl/beta": 0.02797454409301281, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.288697335747027e-07, "logits/chosen": -1.468564510345459, "logits/rejected": -1.4412519931793213, "logps/chosen": -109.62928771972656, "logps/ref_chosen": -75.28189086914062, "logps/ref_rejected": -81.1995849609375, "logps/rejected": -131.20687866210938, "loss": 1.2431, "rewards/accuracies": 0.640625, "rewards/chosen": -0.9610272645950317, "rewards/margins": 0.4295161962509155, "rewards/rejected": -1.3905434608459473, "step": 381 }, { "epoch": 0.5774754346182918, "grad_norm": 9.942258834838867, "kl/avg_steps": 0.4375, "kl/beta": 0.027887394651770592, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.2755254742257706e-07, "logits/chosen": -1.4594696760177612, "logits/rejected": -1.5281555652618408, "logps/chosen": -113.67582702636719, "logps/ref_chosen": -78.74870300292969, "logps/ref_rejected": -99.77484130859375, "logps/rejected": -154.184814453125, "loss": 1.0931, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9716081023216248, "rewards/margins": 0.535379946231842, "rewards/rejected": -1.5069879293441772, "step": 382 }, { "epoch": 0.5789871504157218, "grad_norm": 13.791141510009766, "kl/avg_steps": 0.4375, "kl/beta": 0.027765920385718346, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.2623598917395436e-07, "logits/chosen": -1.6387135982513428, "logits/rejected": -1.3257139921188354, "logps/chosen": -126.18170166015625, "logps/ref_chosen": -95.92772674560547, "logps/ref_rejected": -92.13604736328125, "logps/rejected": -146.85276794433594, "loss": 1.0978, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8407931923866272, "rewards/margins": 0.6689916849136353, "rewards/rejected": -1.5097849369049072, "step": 383 }, { "epoch": 0.5804988662131519, "grad_norm": 9.773907661437988, "kl/avg_steps": 0.546875, "kl/beta": 0.027644973248243332, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.2492009565579875e-07, "logits/chosen": -1.1795517206192017, "logits/rejected": -1.1752303838729858, "logps/chosen": -112.3473129272461, "logps/ref_chosen": -80.20899200439453, "logps/ref_rejected": -94.39380645751953, "logps/rejected": -150.83253479003906, "loss": 1.0315, "rewards/accuracies": 0.78125, "rewards/chosen": -0.885909378528595, "rewards/margins": 0.6628715991973877, "rewards/rejected": -1.548780918121338, "step": 384 }, { "epoch": 0.582010582010582, "grad_norm": 9.207103729248047, "kl/avg_steps": 0.40625, "kl/beta": 0.027494611218571663, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.2360490367648084e-07, "logits/chosen": -1.5247611999511719, "logits/rejected": -1.5652906894683838, "logps/chosen": -121.40055084228516, "logps/ref_chosen": -85.26632690429688, "logps/ref_rejected": -102.1983413696289, "logps/rejected": -163.07257080078125, "loss": 1.0272, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9921504259109497, "rewards/margins": 0.6710430979728699, "rewards/rejected": -1.6631935834884644, "step": 385 }, { "epoch": 0.5835222978080121, "grad_norm": 11.019001007080078, "kl/avg_steps": 0.4375, "kl/beta": 0.027383366599678993, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.2229045002474724e-07, "logits/chosen": -1.2989556789398193, "logits/rejected": -1.536360502243042, "logps/chosen": -135.63621520996094, "logps/ref_chosen": -93.19975280761719, "logps/ref_rejected": -112.98829650878906, "logps/rejected": -170.507568359375, "loss": 1.2507, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1580798625946045, "rewards/margins": 0.40500038862228394, "rewards/rejected": -1.5630803108215332, "step": 386 }, { "epoch": 0.5850340136054422, "grad_norm": 8.650019645690918, "kl/avg_steps": 0.5, "kl/beta": 0.02726408652961254, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.209767714686924e-07, "logits/chosen": -1.4123413562774658, "logits/rejected": -1.272484540939331, "logps/chosen": -100.4596939086914, "logps/ref_chosen": -66.32860565185547, "logps/ref_rejected": -100.56486511230469, "logps/rejected": -163.15472412109375, "loss": 0.9735, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9282127022743225, "rewards/margins": 0.7661646604537964, "rewards/rejected": -1.6943774223327637, "step": 387 }, { "epoch": 0.5865457294028723, "grad_norm": 17.062511444091797, "kl/avg_steps": 0.3125, "kl/beta": 0.027128444984555244, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.1966390475472954e-07, "logits/chosen": -1.4685077667236328, "logits/rejected": -1.4835941791534424, "logps/chosen": -132.53494262695312, "logps/ref_chosen": -92.95967864990234, "logps/ref_rejected": -97.9437255859375, "logps/rejected": -153.61680603027344, "loss": 1.2439, "rewards/accuracies": 0.640625, "rewards/chosen": -1.0734572410583496, "rewards/margins": 0.4278140664100647, "rewards/rejected": -1.5012712478637695, "step": 388 }, { "epoch": 0.5880574452003023, "grad_norm": 8.395482063293457, "kl/avg_steps": 0.53125, "kl/beta": 0.027043931186199188, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.1835188660656265e-07, "logits/chosen": -1.2796024084091187, "logits/rejected": -1.412411093711853, "logps/chosen": -111.56185913085938, "logps/ref_chosen": -76.89031982421875, "logps/ref_rejected": -93.79212951660156, "logps/rejected": -154.0435028076172, "loss": 1.02, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9355847835540771, "rewards/margins": 0.6820655465126038, "rewards/rejected": -1.6176502704620361, "step": 389 }, { "epoch": 0.5895691609977324, "grad_norm": 8.924866676330566, "kl/avg_steps": 0.5625, "kl/beta": 0.026901019737124443, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.170407537241599e-07, "logits/chosen": -1.3276596069335938, "logits/rejected": -1.1806645393371582, "logps/chosen": -88.36257934570312, "logps/ref_chosen": -61.05881881713867, "logps/ref_rejected": -79.55152893066406, "logps/rejected": -131.11795043945312, "loss": 1.0329, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7331553101539612, "rewards/margins": 0.6438949108123779, "rewards/rejected": -1.3770502805709839, "step": 390 }, { "epoch": 0.5910808767951625, "grad_norm": 9.212340354919434, "kl/avg_steps": 0.46875, "kl/beta": 0.02675054781138897, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.1573054278272636e-07, "logits/chosen": -1.4783949851989746, "logits/rejected": -1.3130332231521606, "logps/chosen": -112.14136505126953, "logps/ref_chosen": -78.60820770263672, "logps/ref_rejected": -103.33676147460938, "logps/rejected": -168.4642333984375, "loss": 1.0115, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8957010507583618, "rewards/margins": 0.8333579301834106, "rewards/rejected": -1.729059100151062, "step": 391 }, { "epoch": 0.5925925925925926, "grad_norm": 8.47270679473877, "kl/avg_steps": 0.53125, "kl/beta": 0.0266257394105196, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -1.5991801023483276, "logits/rejected": -1.2611522674560547, "logps/chosen": -114.23375701904297, "logps/ref_chosen": -86.99469757080078, "logps/ref_rejected": -112.73616790771484, "logps/rejected": -174.4199676513672, "loss": 0.9338, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7235299348831177, "rewards/margins": 0.9055388569831848, "rewards/rejected": -1.6290687322616577, "step": 392 }, { "epoch": 0.5941043083900227, "grad_norm": 8.3270902633667, "kl/avg_steps": 0.40625, "kl/beta": 0.026485038921236992, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.131130332936195e-07, "logits/chosen": -1.2096143960952759, "logits/rejected": -1.2805571556091309, "logps/chosen": -108.41553497314453, "logps/ref_chosen": -71.26398468017578, "logps/ref_rejected": -88.99722290039062, "logps/rejected": -152.43466186523438, "loss": 1.0012, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9834790229797363, "rewards/margins": 0.6872824430465698, "rewards/rejected": -1.6707613468170166, "step": 393 }, { "epoch": 0.5956160241874527, "grad_norm": 9.936538696289062, "kl/avg_steps": 0.53125, "kl/beta": 0.026377879083156586, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.1180580796331323e-07, "logits/chosen": -1.1447997093200684, "logits/rejected": -1.3278568983078003, "logps/chosen": -113.02339935302734, "logps/ref_chosen": -78.70564270019531, "logps/ref_rejected": -87.01431274414062, "logps/rejected": -145.6602020263672, "loss": 1.0376, "rewards/accuracies": 0.75, "rewards/chosen": -0.9038447141647339, "rewards/margins": 0.6327704191207886, "rewards/rejected": -1.5366151332855225, "step": 394 }, { "epoch": 0.5971277399848829, "grad_norm": 9.601051330566406, "kl/avg_steps": 0.375, "kl/beta": 0.026238486170768738, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.104996510066625e-07, "logits/chosen": -1.2977375984191895, "logits/rejected": -1.5407524108886719, "logps/chosen": -100.53248596191406, "logps/ref_chosen": -65.30274963378906, "logps/ref_rejected": -93.22492980957031, "logps/rejected": -150.34323120117188, "loss": 1.0998, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9225190281867981, "rewards/margins": 0.5657308101654053, "rewards/rejected": -1.4882497787475586, "step": 395 }, { "epoch": 0.5986394557823129, "grad_norm": 9.81004810333252, "kl/avg_steps": 0.625, "kl/beta": 0.02614045888185501, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.0919459895968517e-07, "logits/chosen": -1.2869203090667725, "logits/rejected": -1.2694096565246582, "logps/chosen": -102.52066040039062, "logps/ref_chosen": -67.33502197265625, "logps/ref_rejected": -98.8193359375, "logps/rejected": -159.61468505859375, "loss": 0.9813, "rewards/accuracies": 0.859375, "rewards/chosen": -0.915213406085968, "rewards/margins": 0.6612294912338257, "rewards/rejected": -1.576442837715149, "step": 396 }, { "epoch": 0.600151171579743, "grad_norm": 12.078638076782227, "kl/avg_steps": 0.4375, "kl/beta": 0.025978095829486847, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.078906883274924e-07, "logits/chosen": -1.2485475540161133, "logits/rejected": -1.1371123790740967, "logps/chosen": -126.35214233398438, "logps/ref_chosen": -89.6042251586914, "logps/ref_rejected": -104.9779052734375, "logps/rejected": -159.32118225097656, "loss": 1.2242, "rewards/accuracies": 0.75, "rewards/chosen": -0.9524945616722107, "rewards/margins": 0.4481835961341858, "rewards/rejected": -1.4006781578063965, "step": 397 }, { "epoch": 0.6016628873771731, "grad_norm": 8.951614379882812, "kl/avg_steps": 0.40625, "kl/beta": 0.025864936411380768, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.065879555832674e-07, "logits/chosen": -1.4257827997207642, "logits/rejected": -1.3792155981063843, "logps/chosen": -99.28257751464844, "logps/ref_chosen": -66.43465423583984, "logps/ref_rejected": -90.90376281738281, "logps/rejected": -153.3382568359375, "loss": 0.9944, "rewards/accuracies": 0.71875, "rewards/chosen": -0.847280740737915, "rewards/margins": 0.7553883790969849, "rewards/rejected": -1.6026690006256104, "step": 398 }, { "epoch": 0.6031746031746031, "grad_norm": 10.219222068786621, "kl/avg_steps": 0.53125, "kl/beta": 0.025760285556316376, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.052864371672457e-07, "logits/chosen": -1.0542405843734741, "logits/rejected": -1.3199812173843384, "logps/chosen": -131.15020751953125, "logps/ref_chosen": -87.22315979003906, "logps/ref_rejected": -136.32411193847656, "logps/rejected": -211.09661865234375, "loss": 0.9769, "rewards/accuracies": 0.78125, "rewards/chosen": -1.130582332611084, "rewards/margins": 0.7845978736877441, "rewards/rejected": -1.9151802062988281, "step": 399 }, { "epoch": 0.6046863189720333, "grad_norm": 13.402877807617188, "kl/avg_steps": 0.25, "kl/beta": 0.02562415786087513, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.0398616948569493e-07, "logits/chosen": -1.2406623363494873, "logits/rejected": -1.7748053073883057, "logps/chosen": -139.27703857421875, "logps/ref_chosen": -91.1212158203125, "logps/ref_rejected": -108.19235229492188, "logps/rejected": -173.50042724609375, "loss": 1.2239, "rewards/accuracies": 0.640625, "rewards/chosen": -1.2333451509475708, "rewards/margins": 0.431148886680603, "rewards/rejected": -1.6644940376281738, "step": 400 }, { "epoch": 0.6046863189720333, "eval_kl/n_epsilon_steps": 0.2614436745643616, "eval_kl/p_epsilon_steps": 0.7372359037399292, "eval_logits/chosen": -1.249387264251709, "eval_logits/rejected": -1.3749488592147827, "eval_logps/chosen": -124.13003540039062, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -159.53257751464844, "eval_loss": 0.5372243523597717, "eval_rewards/accuracies": 0.7429577708244324, "eval_rewards/chosen": -0.950084388256073, "eval_rewards/margins": 0.6450637578964233, "eval_rewards/rejected": -1.5951480865478516, "eval_runtime": 46.8655, "eval_samples_per_second": 49.141, "eval_steps_per_second": 1.536, "step": 400 }, { "epoch": 0.6061980347694633, "grad_norm": 8.485547065734863, "kl/avg_steps": 0.5625, "kl/beta": 0.025560257956385612, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.0268718890989752e-07, "logits/chosen": -1.0869808197021484, "logits/rejected": -1.0903010368347168, "logps/chosen": -98.18634033203125, "logps/ref_chosen": -67.54151153564453, "logps/ref_rejected": -98.06488800048828, "logps/rejected": -162.6023406982422, "loss": 0.9048, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7803738117218018, "rewards/margins": 0.8559675216674805, "rewards/rejected": -1.6363413333892822, "step": 401 }, { "epoch": 0.6077097505668935, "grad_norm": 11.565892219543457, "kl/avg_steps": 0.4375, "kl/beta": 0.025417285040020943, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.013895317751323e-07, "logits/chosen": -1.3836373090744019, "logits/rejected": -1.0982781648635864, "logps/chosen": -110.84662628173828, "logps/ref_chosen": -77.44487762451172, "logps/ref_rejected": -83.1333236694336, "logps/rejected": -144.36251831054688, "loss": 0.993, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8493508100509644, "rewards/margins": 0.6979364156723022, "rewards/rejected": -1.5472872257232666, "step": 402 }, { "epoch": 0.6092214663643235, "grad_norm": 9.853551864624023, "kl/avg_steps": 0.53125, "kl/beta": 0.025306569412350655, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0009323437965898e-07, "logits/chosen": -1.2379374504089355, "logits/rejected": -1.3333361148834229, "logps/chosen": -106.8541030883789, "logps/ref_chosen": -68.8230972290039, "logps/ref_rejected": -99.82356262207031, "logps/rejected": -171.64031982421875, "loss": 0.9869, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9604059457778931, "rewards/margins": 0.8437454700469971, "rewards/rejected": -1.8041512966156006, "step": 403 }, { "epoch": 0.6107331821617535, "grad_norm": 9.483668327331543, "kl/avg_steps": 0.453125, "kl/beta": 0.02517283894121647, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.9879833298370237e-07, "logits/chosen": -1.5257608890533447, "logits/rejected": -1.4390606880187988, "logps/chosen": -115.48838806152344, "logps/ref_chosen": -80.26783752441406, "logps/ref_rejected": -111.60258483886719, "logps/rejected": -178.8803253173828, "loss": 0.9802, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8856651186943054, "rewards/margins": 0.7965131998062134, "rewards/rejected": -1.6821783781051636, "step": 404 }, { "epoch": 0.6122448979591837, "grad_norm": 10.482511520385742, "kl/avg_steps": 0.34375, "kl/beta": 0.02505928836762905, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.975048638084379e-07, "logits/chosen": -1.202235460281372, "logits/rejected": -1.3670843839645386, "logps/chosen": -107.26358032226562, "logps/ref_chosen": -68.31065368652344, "logps/ref_rejected": -81.56044006347656, "logps/rejected": -144.6019287109375, "loss": 1.0878, "rewards/accuracies": 0.671875, "rewards/chosen": -0.9755101203918457, "rewards/margins": 0.5946328043937683, "rewards/rejected": -1.5701429843902588, "step": 405 }, { "epoch": 0.6137566137566137, "grad_norm": 9.058980941772461, "kl/avg_steps": 0.625, "kl/beta": 0.024973442777991295, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.9621286303497914e-07, "logits/chosen": -1.4929070472717285, "logits/rejected": -1.4613780975341797, "logps/chosen": -98.95768737792969, "logps/ref_chosen": -64.86714172363281, "logps/ref_rejected": -110.06051635742188, "logps/rejected": -177.30059814453125, "loss": 0.9677, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8477808833122253, "rewards/margins": 0.817082405090332, "rewards/rejected": -1.6648633480072021, "step": 406 }, { "epoch": 0.6152683295540439, "grad_norm": 16.868221282958984, "kl/avg_steps": 0.421875, "kl/beta": 0.02481832727789879, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.9492236680336483e-07, "logits/chosen": -1.4997903108596802, "logits/rejected": -1.9023494720458984, "logps/chosen": -155.658447265625, "logps/ref_chosen": -102.01712799072266, "logps/ref_rejected": -121.53548431396484, "logps/rejected": -199.111572265625, "loss": 1.0753, "rewards/accuracies": 0.703125, "rewards/chosen": -1.3289562463760376, "rewards/margins": 0.5858536958694458, "rewards/rejected": -1.9148099422454834, "step": 407 }, { "epoch": 0.6167800453514739, "grad_norm": 8.977225303649902, "kl/avg_steps": 0.5, "kl/beta": 0.024714065715670586, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.9363341121154895e-07, "logits/chosen": -1.303640604019165, "logits/rejected": -1.3134398460388184, "logps/chosen": -109.69934844970703, "logps/ref_chosen": -72.77989959716797, "logps/ref_rejected": -92.01815795898438, "logps/rejected": -163.4852294921875, "loss": 0.9017, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9086741805076599, "rewards/margins": 0.8438174724578857, "rewards/rejected": -1.7524917125701904, "step": 408 }, { "epoch": 0.618291761148904, "grad_norm": 11.181131362915039, "kl/avg_steps": 0.25, "kl/beta": 0.02459110878407955, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.9234603231438994e-07, "logits/chosen": -1.167852520942688, "logits/rejected": -1.0533185005187988, "logps/chosen": -126.26255798339844, "logps/ref_chosen": -77.7901611328125, "logps/ref_rejected": -79.2997055053711, "logps/rejected": -147.17803955078125, "loss": 1.2168, "rewards/accuracies": 0.609375, "rewards/chosen": -1.1924008131027222, "rewards/margins": 0.46864524483680725, "rewards/rejected": -1.661046028137207, "step": 409 }, { "epoch": 0.6198034769463341, "grad_norm": 11.200785636901855, "kl/avg_steps": 0.5, "kl/beta": 0.02452978491783142, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -1.6598620414733887, "logits/rejected": -1.5288690328598022, "logps/chosen": -122.84921264648438, "logps/ref_chosen": -80.35844421386719, "logps/ref_rejected": -92.19056701660156, "logps/rejected": -167.47018432617188, "loss": 0.9488, "rewards/accuracies": 0.75, "rewards/chosen": -1.0397820472717285, "rewards/margins": 0.7941303849220276, "rewards/rejected": -1.8339124917984009, "step": 410 }, { "epoch": 0.6213151927437641, "grad_norm": 18.039907455444336, "kl/avg_steps": 0.46875, "kl/beta": 0.024407746270298958, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.8977614860195296e-07, "logits/chosen": -1.147395133972168, "logits/rejected": -1.2293355464935303, "logps/chosen": -113.1170654296875, "logps/ref_chosen": -70.72857666015625, "logps/ref_rejected": -93.19205474853516, "logps/rejected": -168.95037841796875, "loss": 0.963, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0353784561157227, "rewards/margins": 0.8042628765106201, "rewards/rejected": -1.8396413326263428, "step": 411 }, { "epoch": 0.6228269085411943, "grad_norm": 11.914444923400879, "kl/avg_steps": 0.53125, "kl/beta": 0.024293867871165276, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.8849371567184662e-07, "logits/chosen": -0.9596520662307739, "logits/rejected": -0.8119238615036011, "logps/chosen": -122.37300872802734, "logps/ref_chosen": -72.87568664550781, "logps/ref_rejected": -88.21068572998047, "logps/rejected": -167.08526611328125, "loss": 0.9993, "rewards/accuracies": 0.78125, "rewards/chosen": -1.200660228729248, "rewards/margins": 0.7046034336090088, "rewards/rejected": -1.9052636623382568, "step": 412 }, { "epoch": 0.6243386243386243, "grad_norm": 12.280434608459473, "kl/avg_steps": 0.46875, "kl/beta": 0.02416548877954483, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.872130032047302e-07, "logits/chosen": -1.359757900238037, "logits/rejected": -1.4349701404571533, "logps/chosen": -136.6390838623047, "logps/ref_chosen": -84.70051574707031, "logps/ref_rejected": -92.06742095947266, "logps/rejected": -169.40145874023438, "loss": 1.1385, "rewards/accuracies": 0.71875, "rewards/chosen": -1.253061294555664, "rewards/margins": 0.6037291288375854, "rewards/rejected": -1.8567904233932495, "step": 413 }, { "epoch": 0.6258503401360545, "grad_norm": 9.417985916137695, "kl/avg_steps": 0.53125, "kl/beta": 0.02405274286866188, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.8593404702488436e-07, "logits/chosen": -0.892124354839325, "logits/rejected": -0.8772637844085693, "logps/chosen": -122.20513916015625, "logps/ref_chosen": -70.97660827636719, "logps/ref_rejected": -92.90523529052734, "logps/rejected": -174.98516845703125, "loss": 1.0183, "rewards/accuracies": 0.765625, "rewards/chosen": -1.2277061939239502, "rewards/margins": 0.73204505443573, "rewards/rejected": -1.9597512483596802, "step": 414 }, { "epoch": 0.6273620559334845, "grad_norm": 12.158763885498047, "kl/avg_steps": 0.375, "kl/beta": 0.02392563782632351, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.846568829074628e-07, "logits/chosen": -1.1391077041625977, "logits/rejected": -1.0729453563690186, "logps/chosen": -122.39847564697266, "logps/ref_chosen": -71.7189712524414, "logps/ref_rejected": -74.54219818115234, "logps/rejected": -152.57752990722656, "loss": 1.1305, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2116117477416992, "rewards/margins": 0.6440663933753967, "rewards/rejected": -1.8556783199310303, "step": 415 }, { "epoch": 0.6288737717309146, "grad_norm": 10.729681015014648, "kl/avg_steps": 0.40625, "kl/beta": 0.023836251348257065, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.8338154657749128e-07, "logits/chosen": -1.4660162925720215, "logits/rejected": -1.5791254043579102, "logps/chosen": -126.84781646728516, "logps/ref_chosen": -72.88249206542969, "logps/ref_rejected": -85.30692291259766, "logps/rejected": -160.66006469726562, "loss": 1.2088, "rewards/accuracies": 0.65625, "rewards/chosen": -1.285239338874817, "rewards/margins": 0.5007971525192261, "rewards/rejected": -1.786036491394043, "step": 416 }, { "epoch": 0.6303854875283447, "grad_norm": 10.104597091674805, "kl/avg_steps": 0.5625, "kl/beta": 0.023739809170365334, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8210807370886849e-07, "logits/chosen": -1.3048408031463623, "logits/rejected": -1.5203348398208618, "logps/chosen": -124.10682678222656, "logps/ref_chosen": -72.49703216552734, "logps/ref_rejected": -89.38966369628906, "logps/rejected": -176.7666015625, "loss": 0.9937, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2212988138198853, "rewards/margins": 0.837921142578125, "rewards/rejected": -2.0592198371887207, "step": 417 }, { "epoch": 0.6318972033257747, "grad_norm": 11.858148574829102, "kl/avg_steps": 0.265625, "kl/beta": 0.02360701933503151, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.8083649992336825e-07, "logits/chosen": -1.4539004564285278, "logits/rejected": -1.4051744937896729, "logps/chosen": -150.91690063476562, "logps/ref_chosen": -89.70926666259766, "logps/ref_rejected": -90.98756408691406, "logps/rejected": -169.56109619140625, "loss": 1.2647, "rewards/accuracies": 0.671875, "rewards/chosen": -1.445763111114502, "rewards/margins": 0.4013369083404541, "rewards/rejected": -1.8470999002456665, "step": 418 }, { "epoch": 0.6334089191232048, "grad_norm": 10.206515312194824, "kl/avg_steps": 0.625, "kl/beta": 0.02354447916150093, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7956686078964255e-07, "logits/chosen": -1.3517265319824219, "logits/rejected": -1.5228190422058105, "logps/chosen": -115.05638122558594, "logps/ref_chosen": -75.65210723876953, "logps/ref_rejected": -91.00135040283203, "logps/rejected": -171.54263305664062, "loss": 0.8518, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9251964092254639, "rewards/margins": 0.9574536085128784, "rewards/rejected": -1.8826498985290527, "step": 419 }, { "epoch": 0.6349206349206349, "grad_norm": 12.082380294799805, "kl/avg_steps": 0.34375, "kl/beta": 0.02339823916554451, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.782991918222275e-07, "logits/chosen": -1.1388458013534546, "logits/rejected": -1.2260863780975342, "logps/chosen": -128.3040771484375, "logps/ref_chosen": -72.58028411865234, "logps/ref_rejected": -79.90303039550781, "logps/rejected": -158.22250366210938, "loss": 1.2252, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3038926124572754, "rewards/margins": 0.5187857151031494, "rewards/rejected": -1.8226780891418457, "step": 420 }, { "epoch": 0.636432350718065, "grad_norm": 11.983504295349121, "kl/avg_steps": 0.28125, "kl/beta": 0.023318083956837654, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.7703352848054887e-07, "logits/chosen": -1.274233341217041, "logits/rejected": -1.7651447057724, "logps/chosen": -131.89462280273438, "logps/ref_chosen": -78.71546936035156, "logps/ref_rejected": -90.82321166992188, "logps/rejected": -167.80934143066406, "loss": 1.2296, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2419317960739136, "rewards/margins": 0.5444414615631104, "rewards/rejected": -1.7863733768463135, "step": 421 }, { "epoch": 0.6379440665154951, "grad_norm": 11.744466781616211, "kl/avg_steps": 0.40625, "kl/beta": 0.023252686485648155, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.7576990616793137e-07, "logits/chosen": -1.4543811082839966, "logits/rejected": -1.436232566833496, "logps/chosen": -134.90113830566406, "logps/ref_chosen": -86.74519348144531, "logps/ref_rejected": -94.02015686035156, "logps/rejected": -169.43231201171875, "loss": 1.0689, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1186872720718384, "rewards/margins": 0.6243889331817627, "rewards/rejected": -1.7430763244628906, "step": 422 }, { "epoch": 0.6394557823129252, "grad_norm": 9.857087135314941, "kl/avg_steps": 0.53125, "kl/beta": 0.0231586042791605, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.745083602306071e-07, "logits/chosen": -1.21421217918396, "logits/rejected": -1.77446448802948, "logps/chosen": -122.7453384399414, "logps/ref_chosen": -72.02232360839844, "logps/ref_rejected": -93.269775390625, "logps/rejected": -175.97735595703125, "loss": 1.0005, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1713683605194092, "rewards/margins": 0.7311166524887085, "rewards/rejected": -1.9024851322174072, "step": 423 }, { "epoch": 0.6409674981103552, "grad_norm": 10.320049285888672, "kl/avg_steps": 0.6875, "kl/beta": 0.023036224767565727, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.7324892595672804e-07, "logits/chosen": -1.5990121364593506, "logits/rejected": -1.411468505859375, "logps/chosen": -121.66751098632812, "logps/ref_chosen": -68.22148132324219, "logps/ref_rejected": -94.12411499023438, "logps/rejected": -182.5116424560547, "loss": 0.9414, "rewards/accuracies": 0.828125, "rewards/chosen": -1.224773645401001, "rewards/margins": 0.7947477698326111, "rewards/rejected": -2.019521474838257, "step": 424 }, { "epoch": 0.6424792139077853, "grad_norm": 11.145761489868164, "kl/avg_steps": 0.5, "kl/beta": 0.02287893183529377, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7199163857537824e-07, "logits/chosen": -0.9588738679885864, "logits/rejected": -0.9888179898262024, "logps/chosen": -124.32681274414062, "logps/ref_chosen": -75.90104675292969, "logps/ref_rejected": -86.08673095703125, "logps/rejected": -165.00479125976562, "loss": 0.9915, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1062071323394775, "rewards/margins": 0.6885690689086914, "rewards/rejected": -1.794776201248169, "step": 425 }, { "epoch": 0.6439909297052154, "grad_norm": 13.805582046508789, "kl/avg_steps": 0.28125, "kl/beta": 0.022765105590224266, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.7073653325558828e-07, "logits/chosen": -1.2274202108383179, "logits/rejected": -1.1886839866638184, "logps/chosen": -151.05612182617188, "logps/ref_chosen": -89.93118286132812, "logps/ref_rejected": -91.04658508300781, "logps/rejected": -170.2470703125, "loss": 1.2932, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3905560970306396, "rewards/margins": 0.4026485085487366, "rewards/rejected": -1.7932045459747314, "step": 426 }, { "epoch": 0.6455026455026455, "grad_norm": 10.216066360473633, "kl/avg_steps": 0.4375, "kl/beta": 0.022701257839798927, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.6948364510535218e-07, "logits/chosen": -1.3483772277832031, "logits/rejected": -1.4812953472137451, "logps/chosen": -135.0600128173828, "logps/ref_chosen": -77.83393859863281, "logps/ref_rejected": -98.69865417480469, "logps/rejected": -183.77285766601562, "loss": 1.0638, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2972296476364136, "rewards/margins": 0.6236185431480408, "rewards/rejected": -1.9208481311798096, "step": 427 }, { "epoch": 0.6470143613000756, "grad_norm": 10.091684341430664, "kl/avg_steps": 0.4375, "kl/beta": 0.022602373734116554, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -1.261292576789856, "logits/rejected": -1.0671913623809814, "logps/chosen": -145.90863037109375, "logps/ref_chosen": -90.3450927734375, "logps/ref_rejected": -100.24185180664062, "logps/rejected": -189.25491333007812, "loss": 1.0105, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2532129287719727, "rewards/margins": 0.7458755970001221, "rewards/rejected": -1.9990885257720947, "step": 428 }, { "epoch": 0.6485260770975056, "grad_norm": 11.277990341186523, "kl/avg_steps": 0.4375, "kl/beta": 0.022503918036818504, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.669846604344412e-07, "logits/chosen": -1.235177993774414, "logits/rejected": -1.189939022064209, "logps/chosen": -137.588134765625, "logps/ref_chosen": -78.24811553955078, "logps/ref_rejected": -75.24494934082031, "logps/rejected": -165.09100341796875, "loss": 1.1433, "rewards/accuracies": 0.75, "rewards/chosen": -1.3354589939117432, "rewards/margins": 0.6751536726951599, "rewards/rejected": -2.0106124877929688, "step": 429 }, { "epoch": 0.6500377928949358, "grad_norm": 8.702858924865723, "kl/avg_steps": 0.5, "kl/beta": 0.022405892610549927, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6573863381573954e-07, "logits/chosen": -1.2727080583572388, "logits/rejected": -0.9296888113021851, "logps/chosen": -130.50823974609375, "logps/ref_chosen": -76.08027648925781, "logps/ref_rejected": -84.09554290771484, "logps/rejected": -175.60972595214844, "loss": 0.9741, "rewards/accuracies": 0.765625, "rewards/chosen": -1.2184827327728271, "rewards/margins": 0.8200922012329102, "rewards/rejected": -2.0385749340057373, "step": 430 }, { "epoch": 0.6515495086923658, "grad_norm": 11.595307350158691, "kl/avg_steps": 0.46875, "kl/beta": 0.02229442074894905, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -1.2344250679016113, "logits/rejected": -1.259427547454834, "logps/chosen": -116.51612854003906, "logps/ref_chosen": -66.88581085205078, "logps/ref_rejected": -89.56040954589844, "logps/rejected": -169.1386260986328, "loss": 1.0887, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1038061380386353, "rewards/margins": 0.6578041315078735, "rewards/rejected": -1.7616102695465088, "step": 431 }, { "epoch": 0.6530612244897959, "grad_norm": 10.99130630493164, "kl/avg_steps": 0.28125, "kl/beta": 0.022190403193235397, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.632536862810844e-07, "logits/chosen": -1.6262152194976807, "logits/rejected": -1.450348973274231, "logps/chosen": -132.7317352294922, "logps/ref_chosen": -79.65066528320312, "logps/ref_rejected": -103.92634582519531, "logps/rejected": -187.55831909179688, "loss": 1.1065, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1787724494934082, "rewards/margins": 0.6679859161376953, "rewards/rejected": -1.846758246421814, "step": 432 }, { "epoch": 0.654572940287226, "grad_norm": 8.838223457336426, "kl/avg_steps": 0.5, "kl/beta": 0.022128168493509293, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6201483487445515e-07, "logits/chosen": -1.1491540670394897, "logits/rejected": -1.1288440227508545, "logps/chosen": -128.4044952392578, "logps/ref_chosen": -77.30774688720703, "logps/ref_rejected": -81.65180206298828, "logps/rejected": -174.46002197265625, "loss": 0.9816, "rewards/accuracies": 0.75, "rewards/chosen": -1.1291104555130005, "rewards/margins": 0.9108319282531738, "rewards/rejected": -2.039942502975464, "step": 433 }, { "epoch": 0.656084656084656, "grad_norm": 8.635628700256348, "kl/avg_steps": 0.40625, "kl/beta": 0.022018076851963997, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.6077844460203204e-07, "logits/chosen": -1.2930514812469482, "logits/rejected": -1.4299508333206177, "logps/chosen": -105.981689453125, "logps/ref_chosen": -63.31850051879883, "logps/ref_rejected": -89.15093994140625, "logps/rejected": -168.12149047851562, "loss": 1.0291, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9386637806892395, "rewards/margins": 0.7881932854652405, "rewards/rejected": -1.7268571853637695, "step": 434 }, { "epoch": 0.6575963718820862, "grad_norm": 10.655919075012207, "kl/avg_steps": 0.28125, "kl/beta": 0.02192899025976658, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.5954455004830878e-07, "logits/chosen": -1.2391023635864258, "logits/rejected": -1.1685447692871094, "logps/chosen": -127.31586456298828, "logps/ref_chosen": -71.1719741821289, "logps/ref_rejected": -86.42095184326172, "logps/rejected": -169.24099731445312, "loss": 1.1213, "rewards/accuracies": 0.671875, "rewards/chosen": -1.2311064004898071, "rewards/margins": 0.575607419013977, "rewards/rejected": -1.8067138195037842, "step": 435 }, { "epoch": 0.6591080876795162, "grad_norm": 10.871426582336426, "kl/avg_steps": 0.375, "kl/beta": 0.02186748757958412, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5831318572796847e-07, "logits/chosen": -1.319658637046814, "logits/rejected": -1.2668474912643433, "logps/chosen": -123.50926208496094, "logps/ref_chosen": -74.45087432861328, "logps/ref_rejected": -86.01708984375, "logps/rejected": -165.40402221679688, "loss": 1.0927, "rewards/accuracies": 0.703125, "rewards/chosen": -1.072027564048767, "rewards/margins": 0.6532148718833923, "rewards/rejected": -1.7252424955368042, "step": 436 }, { "epoch": 0.6606198034769464, "grad_norm": 11.484429359436035, "kl/avg_steps": 0.28125, "kl/beta": 0.021785791963338852, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.5708438608491815e-07, "logits/chosen": -0.9691067337989807, "logits/rejected": -1.2407987117767334, "logps/chosen": -133.60284423828125, "logps/ref_chosen": -72.38908386230469, "logps/ref_rejected": -111.03279876708984, "logps/rejected": -195.80502319335938, "loss": 1.2222, "rewards/accuracies": 0.625, "rewards/chosen": -1.33245050907135, "rewards/margins": 0.5034982562065125, "rewards/rejected": -1.8359487056732178, "step": 437 }, { "epoch": 0.6621315192743764, "grad_norm": 10.047585487365723, "kl/avg_steps": 0.4375, "kl/beta": 0.02172469161450863, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.558581854913253e-07, "logits/chosen": -1.2113059759140015, "logits/rejected": -1.1109917163848877, "logps/chosen": -108.18560791015625, "logps/ref_chosen": -57.27682876586914, "logps/ref_rejected": -83.07940673828125, "logps/rejected": -171.33041381835938, "loss": 0.9888, "rewards/accuracies": 0.734375, "rewards/chosen": -1.106644630432129, "rewards/margins": 0.8007879257202148, "rewards/rejected": -1.9074325561523438, "step": 438 }, { "epoch": 0.6636432350718064, "grad_norm": 8.809502601623535, "kl/avg_steps": 0.5625, "kl/beta": 0.02163005992770195, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.5463461824665658e-07, "logits/chosen": -1.3769385814666748, "logits/rejected": -1.4923155307769775, "logps/chosen": -153.1571044921875, "logps/ref_chosen": -98.35890197753906, "logps/ref_rejected": -112.69817352294922, "logps/rejected": -200.78346252441406, "loss": 1.0063, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1812466382980347, "rewards/margins": 0.7106728553771973, "rewards/rejected": -1.8919193744659424, "step": 439 }, { "epoch": 0.6651549508692366, "grad_norm": 9.849800109863281, "kl/avg_steps": 0.65625, "kl/beta": 0.021509071812033653, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.534137185767178e-07, "logits/chosen": -1.2204618453979492, "logits/rejected": -1.4051882028579712, "logps/chosen": -104.81169128417969, "logps/ref_chosen": -61.662452697753906, "logps/ref_rejected": -86.81646728515625, "logps/rejected": -172.91555786132812, "loss": 0.8625, "rewards/accuracies": 0.796875, "rewards/chosen": -0.923372745513916, "rewards/margins": 0.9133882522583008, "rewards/rejected": -1.8367609977722168, "step": 440 }, { "epoch": 0.6666666666666666, "grad_norm": 10.69460678100586, "kl/avg_steps": 0.59375, "kl/beta": 0.021368838846683502, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.521955206326976e-07, "logits/chosen": -1.348102331161499, "logits/rejected": -1.6415989398956299, "logps/chosen": -119.5263671875, "logps/ref_chosen": -74.33235168457031, "logps/ref_rejected": -99.654541015625, "logps/rejected": -179.17562866210938, "loss": 0.9737, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9617807865142822, "rewards/margins": 0.7241696119308472, "rewards/rejected": -1.6859502792358398, "step": 441 }, { "epoch": 0.6681783824640968, "grad_norm": 11.308244705200195, "kl/avg_steps": 0.40625, "kl/beta": 0.021242709830403328, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -1.4728314876556396, "logits/rejected": -1.384155035018921, "logps/chosen": -144.53326416015625, "logps/ref_chosen": -82.42591857910156, "logps/ref_rejected": -106.71090698242188, "logps/rejected": -197.6875, "loss": 1.0805, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3178826570510864, "rewards/margins": 0.6041462421417236, "rewards/rejected": -1.9220290184020996, "step": 442 }, { "epoch": 0.6696900982615268, "grad_norm": 11.918970108032227, "kl/avg_steps": 0.53125, "kl/beta": 0.021156759932637215, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.4976736614834662e-07, "logits/chosen": -1.001205563545227, "logits/rejected": -1.202185869216919, "logps/chosen": -124.41624450683594, "logps/ref_chosen": -72.87019348144531, "logps/ref_rejected": -94.48143005371094, "logps/rejected": -187.20260620117188, "loss": 0.9738, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0882420539855957, "rewards/margins": 0.8598648905754089, "rewards/rejected": -1.9481067657470703, "step": 443 }, { "epoch": 0.671201814058957, "grad_norm": 14.023918151855469, "kl/avg_steps": 0.34375, "kl/beta": 0.021044958382844925, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.4855747752871654e-07, "logits/chosen": -1.3784263134002686, "logits/rejected": -1.6147680282592773, "logps/chosen": -137.09361267089844, "logps/ref_chosen": -74.65039825439453, "logps/ref_rejected": -106.89204406738281, "logps/rejected": -185.17578125, "loss": 1.2926, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3123301267623901, "rewards/margins": 0.32617539167404175, "rewards/rejected": -1.638505458831787, "step": 444 }, { "epoch": 0.672713529856387, "grad_norm": 13.924370765686035, "kl/avg_steps": 0.46875, "kl/beta": 0.02097286470234394, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.473504264745062e-07, "logits/chosen": -1.3273162841796875, "logits/rejected": -1.1322157382965088, "logps/chosen": -133.45980834960938, "logps/ref_chosen": -76.26957702636719, "logps/ref_rejected": -89.84994506835938, "logps/rejected": -186.78306579589844, "loss": 0.973, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1988749504089355, "rewards/margins": 0.8226101398468018, "rewards/rejected": -2.021484851837158, "step": 445 }, { "epoch": 0.674225245653817, "grad_norm": 9.73694896697998, "kl/avg_steps": 0.65625, "kl/beta": 0.02087501250207424, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.461462467495284e-07, "logits/chosen": -1.1069025993347168, "logits/rejected": -1.1735448837280273, "logps/chosen": -109.88125610351562, "logps/ref_chosen": -62.74647521972656, "logps/ref_rejected": -86.395751953125, "logps/rejected": -176.1666259765625, "loss": 0.8619, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9798285365104675, "rewards/margins": 0.8798956274986267, "rewards/rejected": -1.8597241640090942, "step": 446 }, { "epoch": 0.6757369614512472, "grad_norm": 10.825597763061523, "kl/avg_steps": 0.546875, "kl/beta": 0.020738914608955383, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.4494497203727843e-07, "logits/chosen": -1.481667399406433, "logits/rejected": -1.7828559875488281, "logps/chosen": -115.90046691894531, "logps/ref_chosen": -71.06666564941406, "logps/ref_rejected": -103.57110595703125, "logps/rejected": -184.4591064453125, "loss": 1.0198, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9274003505706787, "rewards/margins": 0.7373617887496948, "rewards/rejected": -1.6647621393203735, "step": 447 }, { "epoch": 0.6772486772486772, "grad_norm": 9.556791305541992, "kl/avg_steps": 0.46875, "kl/beta": 0.020626114681363106, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.4374663593999256e-07, "logits/chosen": -1.3543277978897095, "logits/rejected": -1.264389991760254, "logps/chosen": -129.90399169921875, "logps/ref_chosen": -73.400146484375, "logps/ref_rejected": -96.34330749511719, "logps/rejected": -183.10574340820312, "loss": 1.0692, "rewards/accuracies": 0.703125, "rewards/chosen": -1.161041498184204, "rewards/margins": 0.6152825355529785, "rewards/rejected": -1.776323914527893, "step": 448 }, { "epoch": 0.6787603930461074, "grad_norm": 14.48768424987793, "kl/avg_steps": 0.25, "kl/beta": 0.02052988111972809, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.4255127197770707e-07, "logits/chosen": -1.723862886428833, "logits/rejected": -1.4851081371307373, "logps/chosen": -156.7047576904297, "logps/ref_chosen": -93.66099548339844, "logps/ref_rejected": -102.53019714355469, "logps/rejected": -185.07656860351562, "loss": 1.2583, "rewards/accuracies": 0.640625, "rewards/chosen": -1.292679786682129, "rewards/margins": 0.39184582233428955, "rewards/rejected": -1.6845256090164185, "step": 449 }, { "epoch": 0.6802721088435374, "grad_norm": 8.63284969329834, "kl/avg_steps": 0.34375, "kl/beta": 0.020478684455156326, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.4135891358732205e-07, "logits/chosen": -1.5146286487579346, "logits/rejected": -1.8276360034942627, "logps/chosen": -109.09320831298828, "logps/ref_chosen": -62.52460479736328, "logps/ref_rejected": -94.04987335205078, "logps/rejected": -168.84970092773438, "loss": 1.0938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.952759861946106, "rewards/margins": 0.569326639175415, "rewards/rejected": -1.522086501121521, "step": 450 }, { "epoch": 0.6817838246409675, "grad_norm": 8.414955139160156, "kl/avg_steps": 0.375, "kl/beta": 0.02040852978825569, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.4016959412166437e-07, "logits/chosen": -1.3371424674987793, "logits/rejected": -1.300230622291565, "logps/chosen": -128.77247619628906, "logps/ref_chosen": -79.14009094238281, "logps/ref_rejected": -93.23920440673828, "logps/rejected": -169.08416748046875, "loss": 1.1148, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0107604265213013, "rewards/margins": 0.5268110036849976, "rewards/rejected": -1.5375714302062988, "step": 451 }, { "epoch": 0.6832955404383976, "grad_norm": 8.979512214660645, "kl/avg_steps": 0.40625, "kl/beta": 0.02033228427171707, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.3898334684855645e-07, "logits/chosen": -1.4430395364761353, "logits/rejected": -1.396399736404419, "logps/chosen": -122.95205688476562, "logps/ref_chosen": -70.38827514648438, "logps/ref_rejected": -95.47691345214844, "logps/rejected": -180.23406982421875, "loss": 1.0688, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0676555633544922, "rewards/margins": 0.644894540309906, "rewards/rejected": -1.7125499248504639, "step": 452 }, { "epoch": 0.6848072562358276, "grad_norm": 9.879496574401855, "kl/avg_steps": 0.46875, "kl/beta": 0.020250018686056137, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.3780020494988445e-07, "logits/chosen": -1.1745672225952148, "logits/rejected": -1.3190906047821045, "logps/chosen": -132.80862426757812, "logps/ref_chosen": -79.9207763671875, "logps/ref_rejected": -90.20779418945312, "logps/rejected": -173.9451904296875, "loss": 1.1077, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0687193870544434, "rewards/margins": 0.6148155927658081, "rewards/rejected": -1.683534860610962, "step": 453 }, { "epoch": 0.6863189720332578, "grad_norm": 9.429729461669922, "kl/avg_steps": 0.5, "kl/beta": 0.020155539736151695, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.366202015206706e-07, "logits/chosen": -1.3146743774414062, "logits/rejected": -1.2765040397644043, "logps/chosen": -113.04515075683594, "logps/ref_chosen": -69.71887969970703, "logps/ref_rejected": -82.86952209472656, "logps/rejected": -163.2214813232422, "loss": 1.0098, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8697865605354309, "rewards/margins": 0.7362134456634521, "rewards/rejected": -1.6060001850128174, "step": 454 }, { "epoch": 0.6878306878306878, "grad_norm": 8.902891159057617, "kl/avg_steps": 0.59375, "kl/beta": 0.020055262371897697, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.354433695681474e-07, "logits/chosen": -1.547599196434021, "logits/rejected": -1.477367877960205, "logps/chosen": -142.42257690429688, "logps/ref_chosen": -89.51481628417969, "logps/ref_rejected": -97.93235778808594, "logps/rejected": -190.3402099609375, "loss": 0.9365, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0567333698272705, "rewards/margins": 0.7826640605926514, "rewards/rejected": -1.8393973112106323, "step": 455 }, { "epoch": 0.6893424036281179, "grad_norm": 9.697734832763672, "kl/avg_steps": 0.5625, "kl/beta": 0.01993688754737377, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.3426974201083439e-07, "logits/chosen": -1.1608035564422607, "logits/rejected": -1.2271933555603027, "logps/chosen": -125.051513671875, "logps/ref_chosen": -74.60526275634766, "logps/ref_rejected": -97.98377227783203, "logps/rejected": -183.13381958007812, "loss": 0.9867, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0037286281585693, "rewards/margins": 0.6831802129745483, "rewards/rejected": -1.6869087219238281, "step": 456 }, { "epoch": 0.690854119425548, "grad_norm": 9.47767448425293, "kl/avg_steps": 0.5, "kl/beta": 0.019825369119644165, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.3309935167761717e-07, "logits/chosen": -1.1805062294006348, "logits/rejected": -1.66481614112854, "logps/chosen": -122.52874755859375, "logps/ref_chosen": -63.927032470703125, "logps/ref_rejected": -83.15243530273438, "logps/rejected": -170.2135772705078, "loss": 1.0579, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1584389209747314, "rewards/margins": 0.5565627217292786, "rewards/rejected": -1.7150015830993652, "step": 457 }, { "epoch": 0.6923658352229781, "grad_norm": 10.947969436645508, "kl/avg_steps": 0.59375, "kl/beta": 0.019726736471056938, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3193223130682936e-07, "logits/chosen": -1.3556028604507446, "logits/rejected": -1.7164582014083862, "logps/chosen": -115.37059020996094, "logps/ref_chosen": -67.68869018554688, "logps/ref_rejected": -104.40899658203125, "logps/rejected": -187.53775024414062, "loss": 0.9928, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9373903274536133, "rewards/margins": 0.6901355981826782, "rewards/rejected": -1.627525806427002, "step": 458 }, { "epoch": 0.6938775510204082, "grad_norm": 10.526549339294434, "kl/avg_steps": 0.53125, "kl/beta": 0.019610300660133362, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3076841354533658e-07, "logits/chosen": -1.671908974647522, "logits/rejected": -1.4922375679016113, "logps/chosen": -131.16941833496094, "logps/ref_chosen": -83.82363891601562, "logps/ref_rejected": -103.7593765258789, "logps/rejected": -188.73544311523438, "loss": 0.984, "rewards/accuracies": 0.75, "rewards/chosen": -0.9256250262260437, "rewards/margins": 0.7286085486412048, "rewards/rejected": -1.654233694076538, "step": 459 }, { "epoch": 0.6953892668178382, "grad_norm": 9.452888488769531, "kl/avg_steps": 0.625, "kl/beta": 0.01950667053461075, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.2960793094762345e-07, "logits/chosen": -1.1332123279571533, "logits/rejected": -1.6192915439605713, "logps/chosen": -136.13206481933594, "logps/ref_chosen": -79.4836654663086, "logps/ref_rejected": -112.31745910644531, "logps/rejected": -207.57310485839844, "loss": 0.9451, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1002308130264282, "rewards/margins": 0.744149386882782, "rewards/rejected": -1.844380259513855, "step": 460 }, { "epoch": 0.6969009826152683, "grad_norm": 9.566939353942871, "kl/avg_steps": 0.515625, "kl/beta": 0.019385511055588722, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2845081597488286e-07, "logits/chosen": -1.3776705265045166, "logits/rejected": -1.5931401252746582, "logps/chosen": -109.80137634277344, "logps/ref_chosen": -64.28482055664062, "logps/ref_rejected": -93.73818969726562, "logps/rejected": -176.29751586914062, "loss": 0.9552, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8800297975540161, "rewards/margins": 0.7092906832695007, "rewards/rejected": -1.5893205404281616, "step": 461 }, { "epoch": 0.6984126984126984, "grad_norm": 9.090144157409668, "kl/avg_steps": 0.59375, "kl/beta": 0.01928606815636158, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.27297100994108e-07, "logits/chosen": -0.8449782133102417, "logits/rejected": -1.0140793323516846, "logps/chosen": -128.93240356445312, "logps/ref_chosen": -77.15335083007812, "logps/ref_rejected": -91.12923431396484, "logps/rejected": -184.95663452148438, "loss": 0.9597, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9942206144332886, "rewards/margins": 0.8003889918327332, "rewards/rejected": -1.794609546661377, "step": 462 }, { "epoch": 0.6999244142101285, "grad_norm": 11.438972473144531, "kl/avg_steps": 0.21875, "kl/beta": 0.01917223259806633, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 1.2614681827718695e-07, "logits/chosen": -1.5319080352783203, "logits/rejected": -1.303840160369873, "logps/chosen": -144.3251953125, "logps/ref_chosen": -87.58760070800781, "logps/ref_rejected": -87.97022247314453, "logps/rejected": -171.95938110351562, "loss": 1.1095, "rewards/accuracies": 0.640625, "rewards/chosen": -1.0876948833465576, "rewards/margins": 0.5146704912185669, "rewards/rejected": -1.602365493774414, "step": 463 }, { "epoch": 0.7014361300075586, "grad_norm": 9.427376747131348, "kl/avg_steps": 0.46875, "kl/beta": 0.019130384549498558, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -1.2840232849121094, "logits/rejected": -1.3823883533477783, "logps/chosen": -134.31118774414062, "logps/ref_chosen": -75.83175659179688, "logps/ref_rejected": -84.4811019897461, "logps/rejected": -181.33981323242188, "loss": 1.0668, "rewards/accuracies": 0.71875, "rewards/chosen": -1.116503357887268, "rewards/margins": 0.7235656380653381, "rewards/rejected": -1.840069055557251, "step": 464 }, { "epoch": 0.7029478458049887, "grad_norm": 11.430929183959961, "kl/avg_steps": 0.4375, "kl/beta": 0.01904112845659256, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.238566782415197e-07, "logits/chosen": -1.1909315586090088, "logits/rejected": -1.3115463256835938, "logps/chosen": -136.56283569335938, "logps/ref_chosen": -77.057861328125, "logps/ref_rejected": -102.75727844238281, "logps/rejected": -197.5943603515625, "loss": 1.0598, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1309858560562134, "rewards/margins": 0.6631190776824951, "rewards/rejected": -1.794105052947998, "step": 465 }, { "epoch": 0.7044595616024187, "grad_norm": 15.998496055603027, "kl/avg_steps": 0.3125, "kl/beta": 0.018958186730742455, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.2271688498291334e-07, "logits/chosen": -1.4902276992797852, "logits/rejected": -1.216938853263855, "logps/chosen": -155.81655883789062, "logps/ref_chosen": -91.7751693725586, "logps/ref_rejected": -90.2679443359375, "logps/rejected": -177.01776123046875, "loss": 1.1964, "rewards/accuracies": 0.640625, "rewards/chosen": -1.215767741203308, "rewards/margins": 0.422646701335907, "rewards/rejected": -1.6384143829345703, "step": 466 }, { "epoch": 0.7059712773998488, "grad_norm": 10.740083694458008, "kl/avg_steps": 0.5625, "kl/beta": 0.01889912784099579, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.2158065210664848e-07, "logits/chosen": -1.1391918659210205, "logits/rejected": -1.6300973892211914, "logps/chosen": -118.06198120117188, "logps/ref_chosen": -64.77557373046875, "logps/ref_rejected": -102.58863830566406, "logps/rejected": -190.88726806640625, "loss": 0.9962, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0028396844863892, "rewards/margins": 0.6537683010101318, "rewards/rejected": -1.6566078662872314, "step": 467 }, { "epoch": 0.7074829931972789, "grad_norm": 11.700135231018066, "kl/avg_steps": 0.5, "kl/beta": 0.018793415278196335, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.204480113956011e-07, "logits/chosen": -1.5250056982040405, "logits/rejected": -1.129932165145874, "logps/chosen": -136.98934936523438, "logps/ref_chosen": -82.22445678710938, "logps/ref_rejected": -92.99041748046875, "logps/rejected": -191.02359008789062, "loss": 0.9621, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0257847309112549, "rewards/margins": 0.8028579950332642, "rewards/rejected": -1.8286426067352295, "step": 468 }, { "epoch": 0.708994708994709, "grad_norm": 9.69684886932373, "kl/avg_steps": 0.421875, "kl/beta": 0.01869991421699524, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1931899453216697e-07, "logits/chosen": -1.6485295295715332, "logits/rejected": -1.3863415718078613, "logps/chosen": -131.82183837890625, "logps/ref_chosen": -75.93031311035156, "logps/ref_rejected": -92.26559448242188, "logps/rejected": -180.31640625, "loss": 1.0334, "rewards/accuracies": 0.734375, "rewards/chosen": -1.0436447858810425, "rewards/margins": 0.5939034819602966, "rewards/rejected": -1.6375482082366943, "step": 469 }, { "epoch": 0.7105064247921391, "grad_norm": 8.640731811523438, "kl/avg_steps": 0.40625, "kl/beta": 0.018621355295181274, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1819363309737438e-07, "logits/chosen": -1.0694873332977295, "logits/rejected": -1.247729778289795, "logps/chosen": -122.56364440917969, "logps/ref_chosen": -65.86345672607422, "logps/ref_rejected": -85.89833068847656, "logps/rejected": -177.00157165527344, "loss": 1.0405, "rewards/accuracies": 0.71875, "rewards/chosen": -1.054185152053833, "rewards/margins": 0.6317137479782104, "rewards/rejected": -1.685899019241333, "step": 470 }, { "epoch": 0.7120181405895691, "grad_norm": 12.291424751281738, "kl/avg_steps": 0.703125, "kl/beta": 0.01854601316154003, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.1707195857000215e-07, "logits/chosen": -1.5064555406570435, "logits/rejected": -1.5125834941864014, "logps/chosen": -124.05204772949219, "logps/ref_chosen": -74.3460922241211, "logps/ref_rejected": -93.43672943115234, "logps/rejected": -188.60585021972656, "loss": 0.9318, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9173819422721863, "rewards/margins": 0.8328464031219482, "rewards/rejected": -1.7502284049987793, "step": 471 }, { "epoch": 0.7135298563869993, "grad_norm": 9.380815505981445, "kl/avg_steps": 0.4375, "kl/beta": 0.018416522070765495, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1595400232569768e-07, "logits/chosen": -1.3893640041351318, "logits/rejected": -1.1879550218582153, "logps/chosen": -125.4832763671875, "logps/ref_chosen": -74.75674438476562, "logps/ref_rejected": -95.18183135986328, "logps/rejected": -183.03448486328125, "loss": 1.0654, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9331221580505371, "rewards/margins": 0.6739637851715088, "rewards/rejected": -1.607085943222046, "step": 472 }, { "epoch": 0.7150415721844293, "grad_norm": 10.035146713256836, "kl/avg_steps": 0.5625, "kl/beta": 0.018336299806833267, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1483979563610069e-07, "logits/chosen": -1.2917176485061646, "logits/rejected": -1.6067132949829102, "logps/chosen": -117.21717071533203, "logps/ref_chosen": -71.65933227539062, "logps/ref_rejected": -109.99200439453125, "logps/rejected": -192.41232299804688, "loss": 1.0543, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8327977657318115, "rewards/margins": 0.6665883660316467, "rewards/rejected": -1.4993860721588135, "step": 473 }, { "epoch": 0.7165532879818595, "grad_norm": 11.2937593460083, "kl/avg_steps": 0.46875, "kl/beta": 0.018233735114336014, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1372936966796709e-07, "logits/chosen": -1.3110771179199219, "logits/rejected": -1.6031813621520996, "logps/chosen": -124.39402770996094, "logps/ref_chosen": -65.91990661621094, "logps/ref_rejected": -89.09432983398438, "logps/rejected": -178.44122314453125, "loss": 1.121, "rewards/accuracies": 0.75, "rewards/chosen": -1.0649316310882568, "rewards/margins": 0.5543969869613647, "rewards/rejected": -1.619328498840332, "step": 474 }, { "epoch": 0.7180650037792895, "grad_norm": 9.53951644897461, "kl/avg_steps": 0.6875, "kl/beta": 0.018148664385080338, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.126227554822985e-07, "logits/chosen": -1.5834732055664062, "logits/rejected": -1.2278413772583008, "logps/chosen": -133.9127197265625, "logps/ref_chosen": -79.02459716796875, "logps/ref_rejected": -107.33058166503906, "logps/rejected": -212.11447143554688, "loss": 0.856, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9914153814315796, "rewards/margins": 0.8952299952507019, "rewards/rejected": -1.8866453170776367, "step": 475 }, { "epoch": 0.7195767195767195, "grad_norm": 9.075240135192871, "kl/avg_steps": 0.421875, "kl/beta": 0.01802474446594715, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1151998403347243e-07, "logits/chosen": -1.4091227054595947, "logits/rejected": -1.4494162797927856, "logps/chosen": -156.70401000976562, "logps/ref_chosen": -93.72602844238281, "logps/ref_rejected": -94.390625, "logps/rejected": -191.04698181152344, "loss": 1.0586, "rewards/accuracies": 0.75, "rewards/chosen": -1.132453441619873, "rewards/margins": 0.5988330245018005, "rewards/rejected": -1.7312864065170288, "step": 476 }, { "epoch": 0.7210884353741497, "grad_norm": 10.94466781616211, "kl/avg_steps": 0.34375, "kl/beta": 0.017949020490050316, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.1042108616837692e-07, "logits/chosen": -1.2909693717956543, "logits/rejected": -1.2547205686569214, "logps/chosen": -147.51918029785156, "logps/ref_chosen": -76.51399993896484, "logps/ref_rejected": -99.14356231689453, "logps/rejected": -197.98812866210938, "loss": 1.203, "rewards/accuracies": 0.671875, "rewards/chosen": -1.2738592624664307, "rewards/margins": 0.49053436517715454, "rewards/rejected": -1.7643935680389404, "step": 477 }, { "epoch": 0.7226001511715797, "grad_norm": 14.016806602478027, "kl/avg_steps": 0.25, "kl/beta": 0.017887532711029053, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.0932609262554746e-07, "logits/chosen": -1.0645103454589844, "logits/rejected": -1.1659934520721436, "logps/chosen": -135.13836669921875, "logps/ref_chosen": -77.95185852050781, "logps/ref_rejected": -69.77754211425781, "logps/rejected": -157.57369995117188, "loss": 1.2105, "rewards/accuracies": 0.625, "rewards/chosen": -1.0235099792480469, "rewards/margins": 0.537567138671875, "rewards/rejected": -1.5610769987106323, "step": 478 }, { "epoch": 0.7241118669690099, "grad_norm": 9.91667366027832, "kl/avg_steps": 0.4375, "kl/beta": 0.01784292608499527, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0823503403430734e-07, "logits/chosen": -1.2003769874572754, "logits/rejected": -1.3509670495986938, "logps/chosen": -132.9344482421875, "logps/ref_chosen": -76.56551361083984, "logps/ref_rejected": -84.33758544921875, "logps/rejected": -169.57940673828125, "loss": 1.1758, "rewards/accuracies": 0.703125, "rewards/chosen": -1.003492832183838, "rewards/margins": 0.5064426064491272, "rewards/rejected": -1.5099353790283203, "step": 479 }, { "epoch": 0.7256235827664399, "grad_norm": 15.995817184448242, "kl/avg_steps": 0.46875, "kl/beta": 0.01776520349085331, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0714794091391072e-07, "logits/chosen": -1.562372088432312, "logits/rejected": -1.1541080474853516, "logps/chosen": -133.58810424804688, "logps/ref_chosen": -80.15884399414062, "logps/ref_rejected": -84.88697814941406, "logps/rejected": -176.2301788330078, "loss": 1.0802, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9479645490646362, "rewards/margins": 0.6631921529769897, "rewards/rejected": -1.611156702041626, "step": 480 }, { "epoch": 0.72713529856387, "grad_norm": 11.84682559967041, "kl/avg_steps": 0.484375, "kl/beta": 0.01768231764435768, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0606484367268906e-07, "logits/chosen": -1.604417085647583, "logits/rejected": -1.5034980773925781, "logps/chosen": -142.07493591308594, "logps/ref_chosen": -84.56254577636719, "logps/ref_rejected": -90.06451416015625, "logps/rejected": -183.5068359375, "loss": 1.067, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0142407417297363, "rewards/margins": 0.6266486644744873, "rewards/rejected": -1.6408894062042236, "step": 481 }, { "epoch": 0.7286470143613001, "grad_norm": 13.854905128479004, "kl/avg_steps": 0.34375, "kl/beta": 0.017597081139683723, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.0498577260720048e-07, "logits/chosen": -1.169127106666565, "logits/rejected": -1.413175344467163, "logps/chosen": -147.22222900390625, "logps/ref_chosen": -78.88141632080078, "logps/ref_rejected": -125.41990661621094, "logps/rejected": -226.0455322265625, "loss": 1.1832, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2037124633789062, "rewards/margins": 0.5583795309066772, "rewards/rejected": -1.762091875076294, "step": 482 }, { "epoch": 0.7301587301587301, "grad_norm": 8.845148086547852, "kl/avg_steps": 0.453125, "kl/beta": 0.017536798492074013, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0391075790138232e-07, "logits/chosen": -1.3627674579620361, "logits/rejected": -1.4596325159072876, "logps/chosen": -127.99396514892578, "logps/ref_chosen": -72.690185546875, "logps/ref_rejected": -98.37237548828125, "logps/rejected": -193.22244262695312, "loss": 1.0312, "rewards/accuracies": 0.734375, "rewards/chosen": -0.968032956123352, "rewards/margins": 0.6844265460968018, "rewards/rejected": -1.6524595022201538, "step": 483 }, { "epoch": 0.7316704459561603, "grad_norm": 10.676874160766602, "kl/avg_steps": 0.46875, "kl/beta": 0.017457693815231323, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0283982962570681e-07, "logits/chosen": -1.6720235347747803, "logits/rejected": -1.4070327281951904, "logps/chosen": -128.504150390625, "logps/ref_chosen": -73.98435974121094, "logps/ref_rejected": -89.99177551269531, "logps/rejected": -174.51548767089844, "loss": 1.0856, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9500585794448853, "rewards/margins": 0.5166537761688232, "rewards/rejected": -1.466712474822998, "step": 484 }, { "epoch": 0.7331821617535903, "grad_norm": 9.896965026855469, "kl/avg_steps": 0.53125, "kl/beta": 0.017376242205500603, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.0177301773633992e-07, "logits/chosen": -1.1052076816558838, "logits/rejected": -1.263450026512146, "logps/chosen": -133.36056518554688, "logps/ref_chosen": -78.0927963256836, "logps/ref_rejected": -89.14010620117188, "logps/rejected": -179.0596923828125, "loss": 1.0581, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9572244882583618, "rewards/margins": 0.5939412713050842, "rewards/rejected": -1.5511658191680908, "step": 485 }, { "epoch": 0.7346938775510204, "grad_norm": 8.270903587341309, "kl/avg_steps": 0.421875, "kl/beta": 0.017284419387578964, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.007103520743035e-07, "logits/chosen": -0.9377778172492981, "logits/rejected": -1.2564281225204468, "logps/chosen": -140.08824157714844, "logps/ref_chosen": -73.74685668945312, "logps/ref_rejected": -107.752685546875, "logps/rejected": -210.189453125, "loss": 1.1214, "rewards/accuracies": 0.765625, "rewards/chosen": -1.144246220588684, "rewards/margins": 0.6144800186157227, "rewards/rejected": -1.7587262392044067, "step": 486 }, { "epoch": 0.7362055933484505, "grad_norm": 9.55226993560791, "kl/avg_steps": 0.46875, "kl/beta": 0.017211806029081345, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 9.965186236464046e-08, "logits/chosen": -1.361416220664978, "logits/rejected": -1.6992472410202026, "logps/chosen": -143.29461669921875, "logps/ref_chosen": -79.57780456542969, "logps/ref_rejected": -102.29163360595703, "logps/rejected": -202.83920288085938, "loss": 1.0364, "rewards/accuracies": 0.75, "rewards/chosen": -1.0940316915512085, "rewards/margins": 0.6254779100418091, "rewards/rejected": -1.7195096015930176, "step": 487 }, { "epoch": 0.7377173091458806, "grad_norm": 13.117379188537598, "kl/avg_steps": 0.375, "kl/beta": 0.01713150180876255, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.859757821558337e-08, "logits/chosen": -1.5597364902496338, "logits/rejected": -1.713277816772461, "logps/chosen": -137.8337860107422, "logps/ref_chosen": -80.62767791748047, "logps/ref_rejected": -100.45410919189453, "logps/rejected": -197.331787109375, "loss": 1.0627, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9779952764511108, "rewards/margins": 0.6699644327163696, "rewards/rejected": -1.6479597091674805, "step": 488 }, { "epoch": 0.7392290249433107, "grad_norm": 10.174800872802734, "kl/avg_steps": 0.3125, "kl/beta": 0.017067499458789825, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 9.754752911772615e-08, "logits/chosen": -1.595609188079834, "logits/rejected": -1.548938274383545, "logps/chosen": -150.69955444335938, "logps/ref_chosen": -85.39521026611328, "logps/ref_rejected": -101.97309875488281, "logps/rejected": -184.8128204345703, "loss": 1.2807, "rewards/accuracies": 0.609375, "rewards/chosen": -1.1136564016342163, "rewards/margins": 0.2926303744316101, "rewards/rejected": -1.4062868356704712, "step": 489 }, { "epoch": 0.7407407407407407, "grad_norm": 10.684184074401855, "kl/avg_steps": 0.28125, "kl/beta": 0.017014330253005028, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 9.650174444319956e-08, "logits/chosen": -1.3730194568634033, "logits/rejected": -1.2876062393188477, "logps/chosen": -136.75741577148438, "logps/ref_chosen": -77.75589752197266, "logps/ref_rejected": -88.98885345458984, "logps/rejected": -183.05950927734375, "loss": 1.1311, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0055129528045654, "rewards/margins": 0.5872576236724854, "rewards/rejected": -1.5927705764770508, "step": 490 }, { "epoch": 0.7422524565381708, "grad_norm": 8.054734230041504, "kl/avg_steps": 0.5, "kl/beta": 0.01696661114692688, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.546025344484868e-08, "logits/chosen": -1.355436086654663, "logits/rejected": -1.3936882019042969, "logps/chosen": -132.57339477539062, "logps/ref_chosen": -74.33360290527344, "logps/ref_rejected": -91.4105224609375, "logps/rejected": -186.3638916015625, "loss": 1.0384, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9841344356536865, "rewards/margins": 0.6149394512176514, "rewards/rejected": -1.599073886871338, "step": 491 }, { "epoch": 0.7437641723356009, "grad_norm": 10.10815715789795, "kl/avg_steps": 0.25, "kl/beta": 0.01688219979405403, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 9.442308525541589e-08, "logits/chosen": -1.2477145195007324, "logits/rejected": -1.6418591737747192, "logps/chosen": -160.24461364746094, "logps/ref_chosen": -85.14178466796875, "logps/ref_rejected": -103.44204711914062, "logps/rejected": -204.75823974609375, "loss": 1.1862, "rewards/accuracies": 0.640625, "rewards/chosen": -1.2674788236618042, "rewards/margins": 0.4344896674156189, "rewards/rejected": -1.7019684314727783, "step": 492 }, { "epoch": 0.745275888133031, "grad_norm": 8.398894309997559, "kl/avg_steps": 0.53125, "kl/beta": 0.016840100288391113, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.339026888672468e-08, "logits/chosen": -1.0465887784957886, "logits/rejected": -1.2313003540039062, "logps/chosen": -134.85989379882812, "logps/ref_chosen": -75.81439971923828, "logps/ref_rejected": -95.30766296386719, "logps/rejected": -191.33767700195312, "loss": 1.0973, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9912848472595215, "rewards/margins": 0.6130992770195007, "rewards/rejected": -1.604384183883667, "step": 493 }, { "epoch": 0.7467876039304611, "grad_norm": 10.89151382446289, "kl/avg_steps": 0.40625, "kl/beta": 0.016751108691096306, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.236183322886945e-08, "logits/chosen": -1.4720494747161865, "logits/rejected": -1.2916717529296875, "logps/chosen": -153.79876708984375, "logps/ref_chosen": -93.83562469482422, "logps/ref_rejected": -112.21142578125, "logps/rejected": -202.45025634765625, "loss": 1.1706, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0018885135650635, "rewards/margins": 0.4987478256225586, "rewards/rejected": -1.500636339187622, "step": 494 }, { "epoch": 0.7482993197278912, "grad_norm": 10.691729545593262, "kl/avg_steps": 0.40625, "kl/beta": 0.01668333262205124, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 9.133780704940594e-08, "logits/chosen": -1.0881528854370117, "logits/rejected": -1.3684172630310059, "logps/chosen": -129.056884765625, "logps/ref_chosen": -68.52467346191406, "logps/ref_rejected": -89.65379333496094, "logps/rejected": -179.65640258789062, "loss": 1.1726, "rewards/accuracies": 0.703125, "rewards/chosen": -1.006978154182434, "rewards/margins": 0.483287513256073, "rewards/rejected": -1.4902657270431519, "step": 495 }, { "epoch": 0.7498110355253212, "grad_norm": 8.830723762512207, "kl/avg_steps": 0.453125, "kl/beta": 0.01661583222448826, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.031821899254797e-08, "logits/chosen": -1.227933406829834, "logits/rejected": -1.4093971252441406, "logps/chosen": -136.625732421875, "logps/ref_chosen": -73.13617706298828, "logps/ref_rejected": -111.5093002319336, "logps/rejected": -209.126220703125, "loss": 1.1085, "rewards/accuracies": 0.75, "rewards/chosen": -1.0537065267562866, "rewards/margins": 0.5583138465881348, "rewards/rejected": -1.6120203733444214, "step": 496 }, { "epoch": 0.7513227513227513, "grad_norm": 11.715909004211426, "kl/avg_steps": 0.5, "kl/beta": 0.016540881246328354, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.930309757836516e-08, "logits/chosen": -1.3785967826843262, "logits/rejected": -1.4464075565338135, "logps/chosen": -156.37203979492188, "logps/ref_chosen": -88.71475219726562, "logps/ref_rejected": -105.74935913085938, "logps/rejected": -214.29281616210938, "loss": 0.9959, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1154170036315918, "rewards/margins": 0.6677843332290649, "rewards/rejected": -1.7832013368606567, "step": 497 }, { "epoch": 0.7528344671201814, "grad_norm": 8.241373062133789, "kl/avg_steps": 0.4375, "kl/beta": 0.01645858772099018, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 8.829247120198563e-08, "logits/chosen": -1.360666036605835, "logits/rejected": -1.3529174327850342, "logps/chosen": -139.6077880859375, "logps/ref_chosen": -83.3353271484375, "logps/ref_rejected": -89.34942626953125, "logps/rejected": -183.6534423828125, "loss": 1.0478, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9240692853927612, "rewards/margins": 0.617653489112854, "rewards/rejected": -1.5417227745056152, "step": 498 }, { "epoch": 0.7543461829176115, "grad_norm": 11.564859390258789, "kl/avg_steps": 0.375, "kl/beta": 0.01638689450919628, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 8.728636813280163e-08, "logits/chosen": -1.5715296268463135, "logits/rejected": -1.7359554767608643, "logps/chosen": -143.91238403320312, "logps/ref_chosen": -79.373779296875, "logps/ref_rejected": -104.62533569335938, "logps/rejected": -201.68991088867188, "loss": 1.1696, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0568264722824097, "rewards/margins": 0.5236354470252991, "rewards/rejected": -1.580461859703064, "step": 499 }, { "epoch": 0.7558578987150416, "grad_norm": 9.394903182983398, "kl/avg_steps": 0.46875, "kl/beta": 0.01632567308843136, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.628481651367875e-08, "logits/chosen": -1.5726468563079834, "logits/rejected": -1.5170438289642334, "logps/chosen": -148.0132293701172, "logps/ref_chosen": -85.953857421875, "logps/ref_rejected": -90.40995788574219, "logps/rejected": -189.79977416992188, "loss": 1.0454, "rewards/accuracies": 0.75, "rewards/chosen": -1.0117685794830322, "rewards/margins": 0.6012208461761475, "rewards/rejected": -1.6129894256591797, "step": 500 }, { "epoch": 0.7558578987150416, "eval_kl/n_epsilon_steps": 0.2680457830429077, "eval_kl/p_epsilon_steps": 0.7293133735656738, "eval_logits/chosen": -1.2671034336090088, "eval_logits/rejected": -1.397099256515503, "eval_logps/chosen": -145.2611083984375, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -190.7860870361328, "eval_loss": 0.5415008068084717, "eval_rewards/accuracies": 0.735035240650177, "eval_rewards/chosen": -0.9467713236808777, "eval_rewards/margins": 0.5722161531448364, "eval_rewards/rejected": -1.5189874172210693, "eval_runtime": 46.9472, "eval_samples_per_second": 49.055, "eval_steps_per_second": 1.534, "step": 500 }, { "epoch": 0.7573696145124716, "grad_norm": 7.951656818389893, "kl/avg_steps": 0.4375, "kl/beta": 0.016249503940343857, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 8.528784436016878e-08, "logits/chosen": -1.4893511533737183, "logits/rejected": -1.2975850105285645, "logps/chosen": -142.8973846435547, "logps/ref_chosen": -81.2226791381836, "logps/ref_rejected": -86.97892761230469, "logps/rejected": -178.79690551757812, "loss": 1.0854, "rewards/accuracies": 0.6875, "rewards/chosen": -0.999392569065094, "rewards/margins": 0.48362892866134644, "rewards/rejected": -1.4830214977264404, "step": 501 }, { "epoch": 0.7588813303099018, "grad_norm": 10.511534690856934, "kl/avg_steps": 0.46875, "kl/beta": 0.01617872156202793, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.4295479559726e-08, "logits/chosen": -1.8129953145980835, "logits/rejected": -1.8130979537963867, "logps/chosen": -143.57913208007812, "logps/ref_chosen": -83.15675354003906, "logps/ref_rejected": -106.74440002441406, "logps/rejected": -194.28616333007812, "loss": 1.1175, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9750958681106567, "rewards/margins": 0.4325859248638153, "rewards/rejected": -1.4076817035675049, "step": 502 }, { "epoch": 0.7603930461073318, "grad_norm": 8.138505935668945, "kl/avg_steps": 0.5625, "kl/beta": 0.016103237867355347, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.330774987092712e-08, "logits/chosen": -1.2757725715637207, "logits/rejected": -0.8529649972915649, "logps/chosen": -123.6448745727539, "logps/ref_chosen": -68.51583862304688, "logps/ref_rejected": -75.02178955078125, "logps/rejected": -169.09393310546875, "loss": 1.0351, "rewards/accuracies": 0.78125, "rewards/chosen": -0.88608318567276, "rewards/margins": 0.618719220161438, "rewards/rejected": -1.5048024654388428, "step": 503 }, { "epoch": 0.7619047619047619, "grad_norm": 22.84149742126465, "kl/avg_steps": 0.609375, "kl/beta": 0.016013164073228836, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.232468292269479e-08, "logits/chosen": -1.4639678001403809, "logits/rejected": -1.071527123451233, "logps/chosen": -139.56570434570312, "logps/ref_chosen": -85.15829467773438, "logps/ref_rejected": -96.16879272460938, "logps/rejected": -197.9119110107422, "loss": 0.917, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8674378991127014, "rewards/margins": 0.7494259476661682, "rewards/rejected": -1.6168639659881592, "step": 504 }, { "epoch": 0.763416477702192, "grad_norm": 12.432364463806152, "kl/avg_steps": 0.375, "kl/beta": 0.01591617427766323, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 8.134630621352483e-08, "logits/chosen": -1.513573408126831, "logits/rejected": -1.8110153675079346, "logps/chosen": -138.6746826171875, "logps/ref_chosen": -79.26185607910156, "logps/ref_rejected": -96.34947967529297, "logps/rejected": -183.6324920654297, "loss": 1.2128, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9458433389663696, "rewards/margins": 0.435134619474411, "rewards/rejected": -1.380977988243103, "step": 505 }, { "epoch": 0.764928193499622, "grad_norm": 10.85437297821045, "kl/avg_steps": 0.5, "kl/beta": 0.01585671305656433, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.037264711071698e-08, "logits/chosen": -1.711536169052124, "logits/rejected": -1.5470472574234009, "logps/chosen": -148.20022583007812, "logps/ref_chosen": -88.192626953125, "logps/ref_rejected": -100.86880493164062, "logps/rejected": -192.3444061279297, "loss": 1.1906, "rewards/accuracies": 0.703125, "rewards/chosen": -0.950194776058197, "rewards/margins": 0.48996031284332275, "rewards/rejected": -1.440155029296875, "step": 506 }, { "epoch": 0.7664399092970522, "grad_norm": 8.822526931762695, "kl/avg_steps": 0.5, "kl/beta": 0.015777822583913803, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.940373284960933e-08, "logits/chosen": -1.528835654258728, "logits/rejected": -1.691847801208496, "logps/chosen": -153.1142578125, "logps/ref_chosen": -86.04632568359375, "logps/ref_rejected": -111.44412994384766, "logps/rejected": -212.93182373046875, "loss": 1.1061, "rewards/accuracies": 0.75, "rewards/chosen": -1.054894208908081, "rewards/margins": 0.5349427461624146, "rewards/rejected": -1.5898369550704956, "step": 507 }, { "epoch": 0.7679516250944822, "grad_norm": 9.772814750671387, "kl/avg_steps": 0.40625, "kl/beta": 0.015699326992034912, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 7.843959053281663e-08, "logits/chosen": -1.3107966184616089, "logits/rejected": -1.7589651346206665, "logps/chosen": -139.20782470703125, "logps/ref_chosen": -79.25038146972656, "logps/ref_rejected": -118.49089813232422, "logps/rejected": -216.39041137695312, "loss": 1.0755, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9390788674354553, "rewards/margins": 0.5872691869735718, "rewards/rejected": -1.5263481140136719, "step": 508 }, { "epoch": 0.7694633408919124, "grad_norm": 8.684977531433105, "kl/avg_steps": 0.53125, "kl/beta": 0.015635807067155838, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.748024712947204e-08, "logits/chosen": -1.1436741352081299, "logits/rejected": -1.225794792175293, "logps/chosen": -136.03652954101562, "logps/ref_chosen": -80.7039566040039, "logps/ref_rejected": -90.50444793701172, "logps/rejected": -183.49346923828125, "loss": 1.0166, "rewards/accuracies": 0.765625, "rewards/chosen": -0.862238883972168, "rewards/margins": 0.5809536576271057, "rewards/rejected": -1.443192481994629, "step": 509 }, { "epoch": 0.7709750566893424, "grad_norm": 9.093579292297363, "kl/avg_steps": 0.5, "kl/beta": 0.015553180128335953, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.652572947447272e-08, "logits/chosen": -1.6221449375152588, "logits/rejected": -1.680511474609375, "logps/chosen": -123.47549438476562, "logps/ref_chosen": -67.64491271972656, "logps/ref_rejected": -108.92274475097656, "logps/rejected": -207.5115966796875, "loss": 1.0332, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8659316301345825, "rewards/margins": 0.6559004187583923, "rewards/rejected": -1.52183198928833, "step": 510 }, { "epoch": 0.7724867724867724, "grad_norm": 9.894843101501465, "kl/avg_steps": 0.46875, "kl/beta": 0.015475801192224026, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.557606426772961e-08, "logits/chosen": -1.6408867835998535, "logits/rejected": -1.515726089477539, "logps/chosen": -135.91790771484375, "logps/ref_chosen": -75.66263580322266, "logps/ref_rejected": -104.26296997070312, "logps/rejected": -210.71661376953125, "loss": 1.0095, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9304167032241821, "rewards/margins": 0.7054550647735596, "rewards/rejected": -1.6358717679977417, "step": 511 }, { "epoch": 0.7739984882842026, "grad_norm": 9.156575202941895, "kl/avg_steps": 0.34375, "kl/beta": 0.015403596684336662, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 7.463127807341966e-08, "logits/chosen": -1.653001070022583, "logits/rejected": -1.8152376413345337, "logps/chosen": -139.13134765625, "logps/ref_chosen": -79.31925964355469, "logps/ref_rejected": -82.22052001953125, "logps/rejected": -171.9754180908203, "loss": 1.1527, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9223538637161255, "rewards/margins": 0.4540788233280182, "rewards/rejected": -1.3764326572418213, "step": 512 }, { "epoch": 0.7755102040816326, "grad_norm": 7.260792255401611, "kl/avg_steps": 0.5625, "kl/beta": 0.01535082794725895, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 7.369139731924401e-08, "logits/chosen": -0.9878244400024414, "logits/rejected": -1.1484776735305786, "logps/chosen": -127.03314971923828, "logps/ref_chosen": -72.02534484863281, "logps/ref_rejected": -86.56224060058594, "logps/rejected": -179.06666564941406, "loss": 1.0315, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8418123722076416, "rewards/margins": 0.568528413772583, "rewards/rejected": -1.4103407859802246, "step": 513 }, { "epoch": 0.7770219198790628, "grad_norm": 9.606550216674805, "kl/avg_steps": 0.453125, "kl/beta": 0.015264962799847126, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.275644829568747e-08, "logits/chosen": -1.5925896167755127, "logits/rejected": -1.3574076890945435, "logps/chosen": -144.4132537841797, "logps/ref_chosen": -84.94092559814453, "logps/ref_rejected": -102.44367218017578, "logps/rejected": -203.36727905273438, "loss": 1.0128, "rewards/accuracies": 0.75, "rewards/chosen": -0.906743049621582, "rewards/margins": 0.6244295835494995, "rewards/rejected": -1.5311726331710815, "step": 514 }, { "epoch": 0.7785336356764928, "grad_norm": 10.59394359588623, "kl/avg_steps": 0.46875, "kl/beta": 0.01519610546529293, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.182645715528435e-08, "logits/chosen": -0.9610180854797363, "logits/rejected": -1.0923925638198853, "logps/chosen": -137.7475128173828, "logps/ref_chosen": -72.9662094116211, "logps/ref_rejected": -102.53651428222656, "logps/rejected": -201.09156799316406, "loss": 1.1114, "rewards/accuracies": 0.75, "rewards/chosen": -0.9821167588233948, "rewards/margins": 0.5052266120910645, "rewards/rejected": -1.4873433113098145, "step": 515 }, { "epoch": 0.780045351473923, "grad_norm": 11.120777130126953, "kl/avg_steps": 0.6875, "kl/beta": 0.015125205740332603, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.090144991188568e-08, "logits/chosen": -0.9688495397567749, "logits/rejected": -1.2150192260742188, "logps/chosen": -126.80594635009766, "logps/ref_chosen": -76.63414001464844, "logps/ref_rejected": -91.01750183105469, "logps/rejected": -173.46884155273438, "loss": 1.1035, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7560686469078064, "rewards/margins": 0.48095256090164185, "rewards/rejected": -1.2370212078094482, "step": 516 }, { "epoch": 0.781557067271353, "grad_norm": 9.966470718383789, "kl/avg_steps": 0.3125, "kl/beta": 0.015021930448710918, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 6.998145243993284e-08, "logits/chosen": -1.5444862842559814, "logits/rejected": -1.267151117324829, "logps/chosen": -144.53884887695312, "logps/ref_chosen": -77.06816864013672, "logps/ref_rejected": -80.048583984375, "logps/rejected": -176.18679809570312, "loss": 1.2028, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0134719610214233, "rewards/margins": 0.42320364713668823, "rewards/rejected": -1.4366756677627563, "step": 517 }, { "epoch": 0.783068783068783, "grad_norm": 7.715234279632568, "kl/avg_steps": 0.375, "kl/beta": 0.014975132420659065, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.906649047373245e-08, "logits/chosen": -1.4431686401367188, "logits/rejected": -1.650618314743042, "logps/chosen": -130.3668212890625, "logps/ref_chosen": -78.69026184082031, "logps/ref_rejected": -97.58125305175781, "logps/rejected": -183.90213012695312, "loss": 1.1056, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7733626365661621, "rewards/margins": 0.5110686421394348, "rewards/rejected": -1.2844313383102417, "step": 518 }, { "epoch": 0.7845804988662132, "grad_norm": 9.29223346710205, "kl/avg_steps": 0.3125, "kl/beta": 0.014919186010956764, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 6.815658960673781e-08, "logits/chosen": -1.1009117364883423, "logits/rejected": -1.2734102010726929, "logps/chosen": -145.77960205078125, "logps/ref_chosen": -78.35087585449219, "logps/ref_rejected": -95.79212188720703, "logps/rejected": -184.5224151611328, "loss": 1.3496, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0064210891723633, "rewards/margins": 0.30923837423324585, "rewards/rejected": -1.3156594038009644, "step": 519 }, { "epoch": 0.7860922146636432, "grad_norm": 9.650991439819336, "kl/avg_steps": 0.5625, "kl/beta": 0.014872708357870579, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 6.725177529083209e-08, "logits/chosen": -1.399254560470581, "logits/rejected": -1.952781319618225, "logps/chosen": -138.1239471435547, "logps/ref_chosen": -80.40513610839844, "logps/ref_rejected": -93.02791595458984, "logps/rejected": -183.82025146484375, "loss": 1.0674, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8555145859718323, "rewards/margins": 0.48555320501327515, "rewards/rejected": -1.3410677909851074, "step": 520 }, { "epoch": 0.7876039304610734, "grad_norm": 10.00867748260498, "kl/avg_steps": 0.5, "kl/beta": 0.014789517968893051, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.63520728356167e-08, "logits/chosen": -1.0487563610076904, "logits/rejected": -1.4563612937927246, "logps/chosen": -147.1343994140625, "logps/ref_chosen": -86.5218276977539, "logps/ref_rejected": -109.20257568359375, "logps/rejected": -205.20623779296875, "loss": 1.0869, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8942147493362427, "rewards/margins": 0.5161318778991699, "rewards/rejected": -1.4103467464447021, "step": 521 }, { "epoch": 0.7891156462585034, "grad_norm": 8.67531681060791, "kl/avg_steps": 0.34375, "kl/beta": 0.01471593789756298, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 6.545750740770336e-08, "logits/chosen": -1.360521674156189, "logits/rejected": -0.9759971499443054, "logps/chosen": -138.8271484375, "logps/ref_chosen": -78.2425537109375, "logps/ref_rejected": -85.23554992675781, "logps/rejected": -174.7383575439453, "loss": 1.1936, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8904162049293518, "rewards/margins": 0.4182692766189575, "rewards/rejected": -1.3086854219436646, "step": 522 }, { "epoch": 0.7906273620559335, "grad_norm": 8.602282524108887, "kl/avg_steps": 0.53125, "kl/beta": 0.014665525406599045, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.456810403001012e-08, "logits/chosen": -1.4247796535491943, "logits/rejected": -1.5719811916351318, "logps/chosen": -149.7041778564453, "logps/ref_chosen": -83.50096893310547, "logps/ref_rejected": -117.4521713256836, "logps/rejected": -217.1758575439453, "loss": 1.1016, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9677379131317139, "rewards/margins": 0.4840565621852875, "rewards/rejected": -1.4517945051193237, "step": 523 }, { "epoch": 0.7921390778533636, "grad_norm": 7.558107376098633, "kl/avg_steps": 0.4375, "kl/beta": 0.014588026329874992, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.368388758106134e-08, "logits/chosen": -1.659937858581543, "logits/rejected": -1.5439331531524658, "logps/chosen": -143.3587646484375, "logps/ref_chosen": -93.22590637207031, "logps/ref_rejected": -108.17863464355469, "logps/rejected": -189.51800537109375, "loss": 1.1301, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7300490140914917, "rewards/margins": 0.4482584297657013, "rewards/rejected": -1.1783075332641602, "step": 524 }, { "epoch": 0.7936507936507936, "grad_norm": 9.238025665283203, "kl/avg_steps": 0.265625, "kl/beta": 0.014524482190608978, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.625, "learning_rate": 6.280488279429185e-08, "logits/chosen": -1.7793774604797363, "logits/rejected": -1.6026604175567627, "logps/chosen": -159.54000854492188, "logps/ref_chosen": -94.08831787109375, "logps/ref_rejected": -100.682373046875, "logps/rejected": -191.29754638671875, "loss": 1.199, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9504117965698242, "rewards/margins": 0.35912391543388367, "rewards/rejected": -1.3095357418060303, "step": 525 }, { "epoch": 0.7951625094482238, "grad_norm": 8.711167335510254, "kl/avg_steps": 0.375, "kl/beta": 0.014486003667116165, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.193111425735515e-08, "logits/chosen": -1.275062918663025, "logits/rejected": -1.5753190517425537, "logps/chosen": -144.21517944335938, "logps/ref_chosen": -77.78373718261719, "logps/ref_rejected": -100.29583740234375, "logps/rejected": -194.74301147460938, "loss": 1.1563, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9615944623947144, "rewards/margins": 0.39926403760910034, "rewards/rejected": -1.36085844039917, "step": 526 }, { "epoch": 0.7966742252456538, "grad_norm": 10.081496238708496, "kl/avg_steps": 0.46875, "kl/beta": 0.014431883580982685, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.106260641143546e-08, "logits/chosen": -1.1058743000030518, "logits/rejected": -1.5538671016693115, "logps/chosen": -149.97344970703125, "logps/ref_chosen": -76.695068359375, "logps/ref_rejected": -107.68281555175781, "logps/rejected": -203.9995574951172, "loss": 1.2175, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0548813343048096, "rewards/margins": 0.3263258934020996, "rewards/rejected": -1.3812072277069092, "step": 527 }, { "epoch": 0.7981859410430839, "grad_norm": 8.379530906677246, "kl/avg_steps": 0.4375, "kl/beta": 0.01436454989016056, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.019938355056422e-08, "logits/chosen": -1.0639445781707764, "logits/rejected": -1.2222740650177002, "logps/chosen": -134.2487030029297, "logps/ref_chosen": -75.0361328125, "logps/ref_rejected": -94.67579650878906, "logps/rejected": -180.45919799804688, "loss": 1.2042, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8484035730361938, "rewards/margins": 0.37475496530532837, "rewards/rejected": -1.2231584787368774, "step": 528 }, { "epoch": 0.799697656840514, "grad_norm": 7.623414993286133, "kl/avg_steps": 0.5625, "kl/beta": 0.01430197898298502, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5.934146982094049e-08, "logits/chosen": -1.2689933776855469, "logits/rejected": -1.3251078128814697, "logps/chosen": -130.27891540527344, "logps/ref_chosen": -72.84869384765625, "logps/ref_rejected": -93.25855255126953, "logps/rejected": -204.9322052001953, "loss": 0.9281, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8188842535018921, "rewards/margins": 0.7665591835975647, "rewards/rejected": -1.5854434967041016, "step": 529 }, { "epoch": 0.8012093726379441, "grad_norm": 8.185614585876465, "kl/avg_steps": 0.34375, "kl/beta": 0.0142219802364707, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 5.848888922025552e-08, "logits/chosen": -1.2606332302093506, "logits/rejected": -1.3375990390777588, "logps/chosen": -137.87921142578125, "logps/ref_chosen": -79.49717712402344, "logps/ref_rejected": -93.59564208984375, "logps/rejected": -189.87530517578125, "loss": 1.0558, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8295097947120667, "rewards/margins": 0.5320351123809814, "rewards/rejected": -1.3615449666976929, "step": 530 }, { "epoch": 0.8027210884353742, "grad_norm": 10.98009967803955, "kl/avg_steps": 0.375, "kl/beta": 0.01417325995862484, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 5.7641665597021435e-08, "logits/chosen": -1.1490346193313599, "logits/rejected": -1.1379203796386719, "logps/chosen": -130.52809143066406, "logps/ref_chosen": -69.45396423339844, "logps/ref_rejected": -96.30017852783203, "logps/rejected": -187.3482666015625, "loss": 1.1267, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8644498586654663, "rewards/margins": 0.41859444975852966, "rewards/rejected": -1.2830442190170288, "step": 531 }, { "epoch": 0.8042328042328042, "grad_norm": 7.920943260192871, "kl/avg_steps": 0.5, "kl/beta": 0.014120308682322502, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.679982264990424e-08, "logits/chosen": -1.4232215881347656, "logits/rejected": -1.4004766941070557, "logps/chosen": -149.61624145507812, "logps/ref_chosen": -76.52011108398438, "logps/ref_rejected": -94.79593658447266, "logps/rejected": -202.38143920898438, "loss": 1.1168, "rewards/accuracies": 0.75, "rewards/chosen": -1.0281970500946045, "rewards/margins": 0.47959816455841064, "rewards/rejected": -1.5077950954437256, "step": 532 }, { "epoch": 0.8057445200302343, "grad_norm": 10.504121780395508, "kl/avg_steps": 0.5, "kl/beta": 0.014050058089196682, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.596338392706076e-08, "logits/chosen": -1.232681393623352, "logits/rejected": -1.6070609092712402, "logps/chosen": -123.11308288574219, "logps/ref_chosen": -72.31800842285156, "logps/ref_rejected": -89.26652526855469, "logps/rejected": -177.32489013671875, "loss": 1.0956, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7114356756210327, "rewards/margins": 0.5157041549682617, "rewards/rejected": -1.227139949798584, "step": 533 }, { "epoch": 0.8072562358276644, "grad_norm": 9.713748931884766, "kl/avg_steps": 0.40625, "kl/beta": 0.013980157673358917, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.513237282548033e-08, "logits/chosen": -1.711059331893921, "logits/rejected": -1.5232676267623901, "logps/chosen": -129.87283325195312, "logps/ref_chosen": -77.87559509277344, "logps/ref_rejected": -92.21171569824219, "logps/rejected": -180.92959594726562, "loss": 1.0767, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7261112928390503, "rewards/margins": 0.5059158205986023, "rewards/rejected": -1.2320270538330078, "step": 534 }, { "epoch": 0.8087679516250945, "grad_norm": 6.756545543670654, "kl/avg_steps": 0.4375, "kl/beta": 0.013923592865467072, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 5.430681259032957e-08, "logits/chosen": -1.16239333152771, "logits/rejected": -1.4077019691467285, "logps/chosen": -142.74710083007812, "logps/ref_chosen": -78.16358184814453, "logps/ref_rejected": -97.78164672851562, "logps/rejected": -195.040771484375, "loss": 1.1583, "rewards/accuracies": 0.71875, "rewards/chosen": -0.898932695388794, "rewards/margins": 0.4476352632045746, "rewards/rejected": -1.3465681076049805, "step": 535 }, { "epoch": 0.8102796674225246, "grad_norm": 6.646007537841797, "kl/avg_steps": 0.6875, "kl/beta": 0.013862942345440388, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -1.2160377502441406, "logits/rejected": -1.4614768028259277, "logps/chosen": -123.78688049316406, "logps/ref_chosen": -66.65623474121094, "logps/ref_rejected": -89.49085998535156, "logps/rejected": -190.17738342285156, "loss": 0.973, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7878190875053406, "rewards/margins": 0.5973426103591919, "rewards/rejected": -1.3851616382598877, "step": 536 }, { "epoch": 0.8117913832199547, "grad_norm": 9.00659465789795, "kl/avg_steps": 0.3125, "kl/beta": 0.013768285512924194, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 5.267213693697695e-08, "logits/chosen": -1.0336707830429077, "logits/rejected": -1.3107452392578125, "logps/chosen": -148.0464324951172, "logps/ref_chosen": -74.99390411376953, "logps/ref_rejected": -110.6627197265625, "logps/rejected": -216.4197235107422, "loss": 1.14, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0053305625915527, "rewards/margins": 0.44309288263320923, "rewards/rejected": -1.4484233856201172, "step": 537 }, { "epoch": 0.8133030990173847, "grad_norm": 7.129688739776611, "kl/avg_steps": 0.53125, "kl/beta": 0.013725393451750278, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.1863067244167144e-08, "logits/chosen": -1.4286093711853027, "logits/rejected": -1.3614468574523926, "logps/chosen": -154.15213012695312, "logps/ref_chosen": -87.61151885986328, "logps/ref_rejected": -98.1150131225586, "logps/rejected": -204.0767059326172, "loss": 1.0504, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9111753106117249, "rewards/margins": 0.533942699432373, "rewards/rejected": -1.4451179504394531, "step": 538 }, { "epoch": 0.8148148148148148, "grad_norm": 9.4677152633667, "kl/avg_steps": 0.5, "kl/beta": 0.0136528629809618, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.105953986729195e-08, "logits/chosen": -0.8625047206878662, "logits/rejected": -1.7717041969299316, "logps/chosen": -143.09307861328125, "logps/ref_chosen": -78.86481475830078, "logps/ref_rejected": -100.84349822998047, "logps/rejected": -197.5748291015625, "loss": 1.1097, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8741194009780884, "rewards/margins": 0.4376823902130127, "rewards/rejected": -1.3118019104003906, "step": 539 }, { "epoch": 0.8163265306122449, "grad_norm": 8.074810981750488, "kl/avg_steps": 0.578125, "kl/beta": 0.013584937900304794, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5.026157728273966e-08, "logits/chosen": -1.2832505702972412, "logits/rejected": -1.7057878971099854, "logps/chosen": -145.83828735351562, "logps/ref_chosen": -83.66409301757812, "logps/ref_rejected": -114.8860092163086, "logps/rejected": -219.39007568359375, "loss": 1.0522, "rewards/accuracies": 0.75, "rewards/chosen": -0.8425248265266418, "rewards/margins": 0.5671800374984741, "rewards/rejected": -1.4097049236297607, "step": 540 }, { "epoch": 0.817838246409675, "grad_norm": 11.797026634216309, "kl/avg_steps": 0.4375, "kl/beta": 0.01350685115903616, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.9469201811239035e-08, "logits/chosen": -1.5112102031707764, "logits/rejected": -1.2197654247283936, "logps/chosen": -140.15115356445312, "logps/ref_chosen": -83.12225341796875, "logps/ref_rejected": -74.80526733398438, "logps/rejected": -176.5388946533203, "loss": 1.0053, "rewards/accuracies": 0.75, "rewards/chosen": -0.7699480056762695, "rewards/margins": 0.5961588621139526, "rewards/rejected": -1.3661069869995117, "step": 541 }, { "epoch": 0.8193499622071051, "grad_norm": 9.535528182983398, "kl/avg_steps": 0.5, "kl/beta": 0.013448016718029976, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.868243561723534e-08, "logits/chosen": -1.2761826515197754, "logits/rejected": -1.1277570724487305, "logps/chosen": -116.5569076538086, "logps/ref_chosen": -66.3132553100586, "logps/ref_rejected": -83.24588012695312, "logps/rejected": -182.03997802734375, "loss": 0.9986, "rewards/accuracies": 0.765625, "rewards/chosen": -0.675069272518158, "rewards/margins": 0.644609272480011, "rewards/rejected": -1.319678544998169, "step": 542 }, { "epoch": 0.8208616780045351, "grad_norm": 6.756430625915527, "kl/avg_steps": 0.65625, "kl/beta": 0.0133811105042696, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.790130070827028e-08, "logits/chosen": -1.245871663093567, "logits/rejected": -1.3569632768630981, "logps/chosen": -128.3046417236328, "logps/ref_chosen": -68.11430358886719, "logps/ref_rejected": -94.62380981445312, "logps/rejected": -196.82461547851562, "loss": 1.0586, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8011859059333801, "rewards/margins": 0.5547268390655518, "rewards/rejected": -1.355912685394287, "step": 543 }, { "epoch": 0.8223733938019653, "grad_norm": 8.612573623657227, "kl/avg_steps": 0.34375, "kl/beta": 0.01329386979341507, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.7125818934366454e-08, "logits/chosen": -1.2979918718338013, "logits/rejected": -1.4303150177001953, "logps/chosen": -141.66336059570312, "logps/ref_chosen": -81.187255859375, "logps/ref_rejected": -105.84722900390625, "logps/rejected": -211.17544555664062, "loss": 1.0335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8036271333694458, "rewards/margins": 0.588148832321167, "rewards/rejected": -1.3917759656906128, "step": 544 }, { "epoch": 0.8238851095993953, "grad_norm": 8.018584251403809, "kl/avg_steps": 0.375, "kl/beta": 0.013248329050838947, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.635601198741607e-08, "logits/chosen": -1.3559893369674683, "logits/rejected": -1.618308186531067, "logps/chosen": -144.0803985595703, "logps/ref_chosen": -78.81717681884766, "logps/ref_rejected": -98.65876770019531, "logps/rejected": -194.8354034423828, "loss": 1.1542, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8626161813735962, "rewards/margins": 0.40304312109947205, "rewards/rejected": -1.265659213066101, "step": 545 }, { "epoch": 0.8253968253968254, "grad_norm": 8.517661094665527, "kl/avg_steps": 0.4375, "kl/beta": 0.013198832981288433, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.559190140057428e-08, "logits/chosen": -1.4798083305358887, "logits/rejected": -1.1381564140319824, "logps/chosen": -131.03048706054688, "logps/ref_chosen": -74.2529296875, "logps/ref_rejected": -80.32308959960938, "logps/rejected": -172.29904174804688, "loss": 1.0965, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7476749420166016, "rewards/margins": 0.45806336402893066, "rewards/rejected": -1.2057383060455322, "step": 546 }, { "epoch": 0.8269085411942555, "grad_norm": 9.102763175964355, "kl/avg_steps": 0.625, "kl/beta": 0.013141339644789696, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.483350854765672e-08, "logits/chosen": -1.624086856842041, "logits/rejected": -1.6054657697677612, "logps/chosen": -124.8319320678711, "logps/ref_chosen": -69.9368896484375, "logps/ref_rejected": -90.25672912597656, "logps/rejected": -191.79208374023438, "loss": 0.9961, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7193492650985718, "rewards/margins": 0.6049911975860596, "rewards/rejected": -1.324340581893921, "step": 547 }, { "epoch": 0.8284202569916855, "grad_norm": 7.866860389709473, "kl/avg_steps": 0.4375, "kl/beta": 0.013059716671705246, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.4080854642541826e-08, "logits/chosen": -1.3938990831375122, "logits/rejected": -1.667206883430481, "logps/chosen": -150.2476806640625, "logps/ref_chosen": -81.16053009033203, "logps/ref_rejected": -99.7246322631836, "logps/rejected": -199.9046630859375, "loss": 1.1613, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9006539583206177, "rewards/margins": 0.39945849776268005, "rewards/rejected": -1.3001124858856201, "step": 548 }, { "epoch": 0.8299319727891157, "grad_norm": 10.859885215759277, "kl/avg_steps": 0.375, "kl/beta": 0.013002828694880009, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.333396073857723e-08, "logits/chosen": -1.2998642921447754, "logits/rejected": -1.5371828079223633, "logps/chosen": -145.29257202148438, "logps/ref_chosen": -80.49800872802734, "logps/ref_rejected": -113.20750427246094, "logps/rejected": -204.1650390625, "loss": 1.2344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8418738842010498, "rewards/margins": 0.3334413170814514, "rewards/rejected": -1.1753151416778564, "step": 549 }, { "epoch": 0.8314436885865457, "grad_norm": 13.323412895202637, "kl/avg_steps": 0.359375, "kl/beta": 0.012954250909388065, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.259284772799099e-08, "logits/chosen": -1.330683946609497, "logits/rejected": -1.3588637113571167, "logps/chosen": -143.0060272216797, "logps/ref_chosen": -75.13760375976562, "logps/ref_rejected": -79.04876708984375, "logps/rejected": -175.20448303222656, "loss": 1.206, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8783211708068848, "rewards/margins": 0.3595704436302185, "rewards/rejected": -1.237891674041748, "step": 550 }, { "epoch": 0.8329554043839759, "grad_norm": 10.381247520446777, "kl/avg_steps": 0.4375, "kl/beta": 0.012907862663269043, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.1857536341307176e-08, "logits/chosen": -1.2748838663101196, "logits/rejected": -1.6231095790863037, "logps/chosen": -150.6908721923828, "logps/ref_chosen": -85.44961547851562, "logps/ref_rejected": -103.48530578613281, "logps/rejected": -202.43276977539062, "loss": 1.1017, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8378610014915466, "rewards/margins": 0.42936253547668457, "rewards/rejected": -1.267223596572876, "step": 551 }, { "epoch": 0.8344671201814059, "grad_norm": 8.95328140258789, "kl/avg_steps": 0.46875, "kl/beta": 0.012851636856794357, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.112804714676593e-08, "logits/chosen": -1.331233263015747, "logits/rejected": -1.4290517568588257, "logps/chosen": -144.78970336914062, "logps/ref_chosen": -82.01036071777344, "logps/ref_rejected": -101.61883544921875, "logps/rejected": -194.92196655273438, "loss": 1.1654, "rewards/accuracies": 0.75, "rewards/chosen": -0.8052812814712524, "rewards/margins": 0.38555577397346497, "rewards/rejected": -1.190837025642395, "step": 552 }, { "epoch": 0.8359788359788359, "grad_norm": 14.167531967163086, "kl/avg_steps": 0.296875, "kl/beta": 0.012791676446795464, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.0404400549748144e-08, "logits/chosen": -1.1294629573822021, "logits/rejected": -1.3467081785202026, "logps/chosen": -143.1853485107422, "logps/ref_chosen": -73.81416320800781, "logps/ref_rejected": -104.27049255371094, "logps/rejected": -211.3553466796875, "loss": 1.1377, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8864268660545349, "rewards/margins": 0.47457796335220337, "rewards/rejected": -1.3610048294067383, "step": 553 }, { "epoch": 0.8374905517762661, "grad_norm": 8.690716743469238, "kl/avg_steps": 0.5, "kl/beta": 0.012753813527524471, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.968661679220467e-08, "logits/chosen": -1.6826214790344238, "logits/rejected": -1.6105924844741821, "logps/chosen": -148.13287353515625, "logps/ref_chosen": -81.43980407714844, "logps/ref_rejected": -89.32518005371094, "logps/rejected": -194.12750244140625, "loss": 1.102, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8492714762687683, "rewards/margins": 0.4789280295372009, "rewards/rejected": -1.3281995058059692, "step": 554 }, { "epoch": 0.8390022675736961, "grad_norm": 7.373947620391846, "kl/avg_steps": 0.59375, "kl/beta": 0.012690361589193344, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.89747159520904e-08, "logits/chosen": -1.260491967201233, "logits/rejected": -1.153106451034546, "logps/chosen": -151.04708862304688, "logps/ref_chosen": -81.66071319580078, "logps/ref_rejected": -87.20857238769531, "logps/rejected": -195.9105224609375, "loss": 1.1195, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8781550526618958, "rewards/margins": 0.49082207679748535, "rewards/rejected": -1.3689770698547363, "step": 555 }, { "epoch": 0.8405139833711263, "grad_norm": 7.1971821784973145, "kl/avg_steps": 0.375, "kl/beta": 0.01261545717716217, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.826871794280192e-08, "logits/chosen": -1.067447543144226, "logits/rejected": -1.1644517183303833, "logps/chosen": -138.2876739501953, "logps/ref_chosen": -66.02448272705078, "logps/ref_rejected": -82.7474594116211, "logps/rejected": -190.40380859375, "loss": 1.125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9105790853500366, "rewards/margins": 0.4397725760936737, "rewards/rejected": -1.3503515720367432, "step": 556 }, { "epoch": 0.8420256991685563, "grad_norm": 7.1572651863098145, "kl/avg_steps": 0.46875, "kl/beta": 0.012568325735628605, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.756864251262143e-08, "logits/chosen": -1.3210291862487793, "logits/rejected": -1.946714162826538, "logps/chosen": -140.40304565429688, "logps/ref_chosen": -73.08985900878906, "logps/ref_rejected": -97.43034362792969, "logps/rejected": -209.40664672851562, "loss": 1.0402, "rewards/accuracies": 0.75, "rewards/chosen": -0.8453344106674194, "rewards/margins": 0.5539640188217163, "rewards/rejected": -1.3992985486984253, "step": 557 }, { "epoch": 0.8435374149659864, "grad_norm": 8.183286666870117, "kl/avg_steps": 0.46875, "kl/beta": 0.01250968687236309, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.687450924416341e-08, "logits/chosen": -1.392795443534851, "logits/rejected": -1.3525338172912598, "logps/chosen": -148.27149963378906, "logps/ref_chosen": -80.1357192993164, "logps/ref_rejected": -106.65797424316406, "logps/rejected": -216.39918518066406, "loss": 1.0724, "rewards/accuracies": 0.75, "rewards/chosen": -0.8508350253105164, "rewards/margins": 0.5131612420082092, "rewards/rejected": -1.3639963865280151, "step": 558 }, { "epoch": 0.8450491307634165, "grad_norm": 8.342129707336426, "kl/avg_steps": 0.40625, "kl/beta": 0.012451320886611938, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.6186337553827743e-08, "logits/chosen": -1.437471866607666, "logits/rejected": -1.7483036518096924, "logps/chosen": -144.3199462890625, "logps/ref_chosen": -79.42267608642578, "logps/ref_rejected": -98.59402465820312, "logps/rejected": -200.57098388671875, "loss": 1.1613, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8071417808532715, "rewards/margins": 0.4540879726409912, "rewards/rejected": -1.2612297534942627, "step": 559 }, { "epoch": 0.8465608465608465, "grad_norm": 9.320462226867676, "kl/avg_steps": 0.40625, "kl/beta": 0.012400942854583263, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.550414669125573e-08, "logits/chosen": -1.2708487510681152, "logits/rejected": -1.1666545867919922, "logps/chosen": -143.94570922851562, "logps/ref_chosen": -77.49559020996094, "logps/ref_rejected": -92.61347961425781, "logps/rejected": -200.3899383544922, "loss": 1.074, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8230661749839783, "rewards/margins": 0.5055770874023438, "rewards/rejected": -1.3286433219909668, "step": 560 }, { "epoch": 0.8480725623582767, "grad_norm": 6.0093607902526855, "kl/avg_steps": 0.4375, "kl/beta": 0.012350767850875854, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.482795573879241e-08, "logits/chosen": -1.5282151699066162, "logits/rejected": -1.2929655313491821, "logps/chosen": -144.45321655273438, "logps/ref_chosen": -79.20771789550781, "logps/ref_rejected": -93.46514892578125, "logps/rejected": -201.19638061523438, "loss": 1.0516, "rewards/accuracies": 0.75, "rewards/chosen": -0.8044455647468567, "rewards/margins": 0.518153965473175, "rewards/rejected": -1.3225995302200317, "step": 561 }, { "epoch": 0.8495842781557067, "grad_norm": 7.105699062347412, "kl/avg_steps": 0.421875, "kl/beta": 0.012296968139708042, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.415778361095226e-08, "logits/chosen": -1.346250057220459, "logits/rejected": -1.459496021270752, "logps/chosen": -162.78598022460938, "logps/ref_chosen": -94.88652801513672, "logps/ref_rejected": -109.33815002441406, "logps/rejected": -219.45779418945312, "loss": 1.0508, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8341754078865051, "rewards/margins": 0.5127763152122498, "rewards/rejected": -1.3469517230987549, "step": 562 }, { "epoch": 0.8510959939531368, "grad_norm": 8.515876770019531, "kl/avg_steps": 0.46875, "kl/beta": 0.012245308607816696, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.349364905389032e-08, "logits/chosen": -1.0890107154846191, "logits/rejected": -1.182828664779663, "logps/chosen": -123.77273559570312, "logps/ref_chosen": -65.90719604492188, "logps/ref_rejected": -84.07121276855469, "logps/rejected": -183.04513549804688, "loss": 1.1116, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7075780630111694, "rewards/margins": 0.4956972599029541, "rewards/rejected": -1.2032753229141235, "step": 563 }, { "epoch": 0.8526077097505669, "grad_norm": 7.4643330574035645, "kl/avg_steps": 0.46875, "kl/beta": 0.01218817662447691, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.283557064487785e-08, "logits/chosen": -1.354698657989502, "logits/rejected": -1.3166128396987915, "logps/chosen": -131.1250457763672, "logps/ref_chosen": -72.32070922851562, "logps/ref_rejected": -88.05013275146484, "logps/rejected": -188.62979125976562, "loss": 1.1037, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7161024212837219, "rewards/margins": 0.501507580280304, "rewards/rejected": -1.2176098823547363, "step": 564 }, { "epoch": 0.854119425547997, "grad_norm": 6.4226274490356445, "kl/avg_steps": 0.375, "kl/beta": 0.012131310999393463, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.218356679178252e-08, "logits/chosen": -1.488287329673767, "logits/rejected": -1.343492031097412, "logps/chosen": -151.63772583007812, "logps/ref_chosen": -80.18453979492188, "logps/ref_rejected": -99.55126953125, "logps/rejected": -208.01727294921875, "loss": 1.1099, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8652534484863281, "rewards/margins": 0.44262397289276123, "rewards/rejected": -1.3078773021697998, "step": 565 }, { "epoch": 0.8556311413454271, "grad_norm": 10.024022102355957, "kl/avg_steps": 0.4375, "kl/beta": 0.012085988186299801, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.1537655732553764e-08, "logits/chosen": -1.485852599143982, "logits/rejected": -1.6031005382537842, "logps/chosen": -150.26303100585938, "logps/ref_chosen": -88.0877914428711, "logps/ref_rejected": -87.7589111328125, "logps/rejected": -187.7733154296875, "loss": 1.1223, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7501992583274841, "rewards/margins": 0.45009276270866394, "rewards/rejected": -1.2002918720245361, "step": 566 }, { "epoch": 0.8571428571428571, "grad_norm": 8.078023910522461, "kl/avg_steps": 0.53125, "kl/beta": 0.012033342383801937, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.089785553471233e-08, "logits/chosen": -1.1604113578796387, "logits/rejected": -1.6355693340301514, "logps/chosen": -135.47256469726562, "logps/ref_chosen": -69.93267822265625, "logps/ref_rejected": -95.71786499023438, "logps/rejected": -205.11859130859375, "loss": 1.0358, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7865634560585022, "rewards/margins": 0.5210832953453064, "rewards/rejected": -1.3076467514038086, "step": 567 }, { "epoch": 0.8586545729402872, "grad_norm": 6.185924053192139, "kl/avg_steps": 0.46875, "kl/beta": 0.011969752609729767, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.026418409484513e-08, "logits/chosen": -1.6622331142425537, "logits/rejected": -1.738410472869873, "logps/chosen": -126.7143325805664, "logps/ref_chosen": -70.33343505859375, "logps/ref_rejected": -108.86270904541016, "logps/rejected": -210.11373901367188, "loss": 1.0413, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6726874113082886, "rewards/margins": 0.5302541255950928, "rewards/rejected": -1.2029415369033813, "step": 568 }, { "epoch": 0.8601662887377173, "grad_norm": 8.796441078186035, "kl/avg_steps": 0.46875, "kl/beta": 0.011913906782865524, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.963665913810451e-08, "logits/chosen": -1.6774063110351562, "logits/rejected": -1.4297425746917725, "logps/chosen": -146.59378051757812, "logps/ref_chosen": -80.85043334960938, "logps/ref_rejected": -92.77810668945312, "logps/rejected": -190.6685791015625, "loss": 1.1798, "rewards/accuracies": 0.75, "rewards/chosen": -0.7816181778907776, "rewards/margins": 0.3761640191078186, "rewards/rejected": -1.1577821969985962, "step": 569 }, { "epoch": 0.8616780045351474, "grad_norm": 5.814436912536621, "kl/avg_steps": 0.625, "kl/beta": 0.0118583207949996, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9015298217712453e-08, "logits/chosen": -1.6274659633636475, "logits/rejected": -1.604426383972168, "logps/chosen": -124.55775451660156, "logps/ref_chosen": -69.94769287109375, "logps/ref_rejected": -97.37059020996094, "logps/rejected": -204.11428833007812, "loss": 0.9832, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6452935338020325, "rewards/margins": 0.6108759641647339, "rewards/rejected": -1.2561695575714111, "step": 570 }, { "epoch": 0.8631897203325775, "grad_norm": 11.918949127197266, "kl/avg_steps": 0.4375, "kl/beta": 0.01178466621786356, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.840011871446962e-08, "logits/chosen": -1.4016327857971191, "logits/rejected": -1.4089144468307495, "logps/chosen": -135.0509796142578, "logps/ref_chosen": -72.28555297851562, "logps/ref_rejected": -84.57748413085938, "logps/rejected": -176.27073669433594, "loss": 1.2017, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7387359142303467, "rewards/margins": 0.3344320058822632, "rewards/rejected": -1.0731679201126099, "step": 571 }, { "epoch": 0.8647014361300076, "grad_norm": 7.906334400177002, "kl/avg_steps": 0.5, "kl/beta": 0.011733333580195904, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -1.5348182916641235, "logits/rejected": -1.1779015064239502, "logps/chosen": -158.45730590820312, "logps/ref_chosen": -91.49070739746094, "logps/ref_rejected": -80.44602966308594, "logps/rejected": -184.7002716064453, "loss": 1.0925, "rewards/accuracies": 0.75, "rewards/chosen": -0.7837091684341431, "rewards/margins": 0.4318312406539917, "rewards/rejected": -1.2155404090881348, "step": 572 }, { "epoch": 0.8662131519274376, "grad_norm": 8.190794944763184, "kl/avg_steps": 0.3125, "kl/beta": 0.011674958281219006, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.718837261761528e-08, "logits/chosen": -1.6026732921600342, "logits/rejected": -1.552854061126709, "logps/chosen": -156.527587890625, "logps/ref_chosen": -87.54232788085938, "logps/ref_rejected": -104.32984924316406, "logps/rejected": -210.606689453125, "loss": 1.1659, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8060653209686279, "rewards/margins": 0.4282020926475525, "rewards/rejected": -1.2342674732208252, "step": 573 }, { "epoch": 0.8677248677248677, "grad_norm": 6.598454475402832, "kl/avg_steps": 0.46875, "kl/beta": 0.011638588272035122, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.659183991914696e-08, "logits/chosen": -1.359856367111206, "logits/rejected": -1.107245683670044, "logps/chosen": -137.58914184570312, "logps/ref_chosen": -75.36632537841797, "logps/ref_rejected": -103.27328491210938, "logps/rejected": -213.42083740234375, "loss": 1.0219, "rewards/accuracies": 0.765625, "rewards/chosen": -0.722530722618103, "rewards/margins": 0.5507799386978149, "rewards/rejected": -1.273310661315918, "step": 574 }, { "epoch": 0.8692365835222978, "grad_norm": 6.816895008087158, "kl/avg_steps": 0.34375, "kl/beta": 0.011584286577999592, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.600155642716606e-08, "logits/chosen": -1.1317976713180542, "logits/rejected": -1.4034576416015625, "logps/chosen": -150.38589477539062, "logps/ref_chosen": -81.678466796875, "logps/ref_rejected": -112.84233093261719, "logps/rejected": -212.930419921875, "loss": 1.193, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7960365414619446, "rewards/margins": 0.3575257360935211, "rewards/rejected": -1.153562307357788, "step": 575 }, { "epoch": 0.8707482993197279, "grad_norm": 6.988912105560303, "kl/avg_steps": 0.5625, "kl/beta": 0.011544601991772652, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.5417538653170754e-08, "logits/chosen": -1.3333091735839844, "logits/rejected": -1.4861361980438232, "logps/chosen": -125.4324722290039, "logps/ref_chosen": -68.78944396972656, "logps/ref_rejected": -102.79037475585938, "logps/rejected": -207.86514282226562, "loss": 1.0285, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6519675254821777, "rewards/margins": 0.5519933700561523, "rewards/rejected": -1.20396089553833, "step": 576 }, { "epoch": 0.872260015117158, "grad_norm": 7.1114277839660645, "kl/avg_steps": 0.390625, "kl/beta": 0.011480026878416538, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.4839802933393607e-08, "logits/chosen": -1.477907419204712, "logits/rejected": -1.5636188983917236, "logps/chosen": -141.4581298828125, "logps/ref_chosen": -79.84674835205078, "logps/ref_rejected": -84.08309936523438, "logps/rejected": -173.64480590820312, "loss": 1.2078, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7065240144729614, "rewards/margins": 0.3152022957801819, "rewards/rejected": -1.021726369857788, "step": 577 }, { "epoch": 0.873771730914588, "grad_norm": 6.939554691314697, "kl/avg_steps": 0.21875, "kl/beta": 0.011435357853770256, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 2.4268365428344733e-08, "logits/chosen": -1.0849415063858032, "logits/rejected": -1.3800511360168457, "logps/chosen": -138.2547149658203, "logps/ref_chosen": -74.91357421875, "logps/ref_rejected": -83.64881896972656, "logps/rejected": -173.4388885498047, "loss": 1.2029, "rewards/accuracies": 0.625, "rewards/chosen": -0.7240015268325806, "rewards/margins": 0.2974473834037781, "rewards/rejected": -1.0214489698410034, "step": 578 }, { "epoch": 0.8752834467120182, "grad_norm": 6.664099216461182, "kl/avg_steps": 0.65625, "kl/beta": 0.011410397477447987, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.3703242122359357e-08, "logits/chosen": -1.2816755771636963, "logits/rejected": -1.155468225479126, "logps/chosen": -143.69326782226562, "logps/ref_chosen": -75.51022338867188, "logps/ref_rejected": -84.83192443847656, "logps/rejected": -195.86459350585938, "loss": 1.059, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7744640707969666, "rewards/margins": 0.48263052105903625, "rewards/rejected": -1.2570946216583252, "step": 579 }, { "epoch": 0.8767951625094482, "grad_norm": 7.74460506439209, "kl/avg_steps": 0.53125, "kl/beta": 0.011336004361510277, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3144448823151392e-08, "logits/chosen": -1.5838890075683594, "logits/rejected": -1.6980791091918945, "logps/chosen": -133.63670349121094, "logps/ref_chosen": -76.61564636230469, "logps/ref_rejected": -97.09959411621094, "logps/rejected": -192.4454345703125, "loss": 1.1247, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6440767645835876, "rewards/margins": 0.4278578460216522, "rewards/rejected": -1.0719345808029175, "step": 580 }, { "epoch": 0.8783068783068783, "grad_norm": 6.301844120025635, "kl/avg_steps": 0.453125, "kl/beta": 0.011276100762188435, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.259200116137039e-08, "logits/chosen": -1.5977568626403809, "logits/rejected": -1.468361735343933, "logps/chosen": -145.87472534179688, "logps/ref_chosen": -74.8531265258789, "logps/ref_rejected": -101.5344009399414, "logps/rejected": -210.183349609375, "loss": 1.1315, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8000357151031494, "rewards/margins": 0.41744327545166016, "rewards/rejected": -1.2174789905548096, "step": 581 }, { "epoch": 0.8798185941043084, "grad_norm": 6.888444423675537, "kl/avg_steps": 0.40625, "kl/beta": 0.011225235648453236, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.204591459016525e-08, "logits/chosen": -1.4427552223205566, "logits/rejected": -1.0628981590270996, "logps/chosen": -148.77474975585938, "logps/ref_chosen": -81.07638549804688, "logps/ref_rejected": -72.83570861816406, "logps/rejected": -178.07257080078125, "loss": 1.1383, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7592304348945618, "rewards/margins": 0.41485366225242615, "rewards/rejected": -1.1740840673446655, "step": 582 }, { "epoch": 0.8813303099017384, "grad_norm": 8.803990364074707, "kl/avg_steps": 0.375, "kl/beta": 0.011179817840456963, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.1506204384751064e-08, "logits/chosen": -1.0734798908233643, "logits/rejected": -1.4834539890289307, "logps/chosen": -133.71963500976562, "logps/ref_chosen": -66.78465270996094, "logps/ref_rejected": -106.45826721191406, "logps/rejected": -207.7127685546875, "loss": 1.1892, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7471137046813965, "rewards/margins": 0.3767395615577698, "rewards/rejected": -1.1238532066345215, "step": 583 }, { "epoch": 0.8828420256991686, "grad_norm": 7.770105361938477, "kl/avg_steps": 0.3125, "kl/beta": 0.011138049885630608, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.09728856419826e-08, "logits/chosen": -0.9404406547546387, "logits/rejected": -1.262127161026001, "logps/chosen": -116.41361236572266, "logps/ref_chosen": -60.80291748046875, "logps/ref_rejected": -99.45012664794922, "logps/rejected": -191.92051696777344, "loss": 1.1544, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6201804876327515, "rewards/margins": 0.4037685990333557, "rewards/rejected": -1.023949146270752, "step": 584 }, { "epoch": 0.8843537414965986, "grad_norm": 5.805047988891602, "kl/avg_steps": 0.46875, "kl/beta": 0.011103352531790733, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.044597327993153e-08, "logits/chosen": -1.5974360704421997, "logits/rejected": -1.2341102361679077, "logps/chosen": -140.76992797851562, "logps/ref_chosen": -75.92616271972656, "logps/ref_rejected": -94.47601318359375, "logps/rejected": -186.14279174804688, "loss": 1.2217, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7173092365264893, "rewards/margins": 0.2924202084541321, "rewards/rejected": -1.0097295045852661, "step": 585 }, { "epoch": 0.8858654572940288, "grad_norm": 7.581094264984131, "kl/avg_steps": 0.59375, "kl/beta": 0.011051548644900322, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.9925482037469187e-08, "logits/chosen": -1.2922358512878418, "logits/rejected": -1.3954906463623047, "logps/chosen": -128.05166625976562, "logps/ref_chosen": -68.62062072753906, "logps/ref_rejected": -81.98324584960938, "logps/rejected": -186.36367797851562, "loss": 1.0506, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6543450355529785, "rewards/margins": 0.4905773401260376, "rewards/rejected": -1.1449224948883057, "step": 586 }, { "epoch": 0.8873771730914588, "grad_norm": 10.84594440460205, "kl/avg_steps": 0.46875, "kl/beta": 0.010986316949129105, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.9411426473854687e-08, "logits/chosen": -1.5820139646530151, "logits/rejected": -1.4005577564239502, "logps/chosen": -133.91355895996094, "logps/ref_chosen": -77.67031860351562, "logps/ref_rejected": -79.35327911376953, "logps/rejected": -183.68438720703125, "loss": 1.0622, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6174869537353516, "rewards/margins": 0.5209751129150391, "rewards/rejected": -1.1384620666503906, "step": 587 }, { "epoch": 0.8888888888888888, "grad_norm": 12.146368026733398, "kl/avg_steps": 0.546875, "kl/beta": 0.010935058817267418, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.890382096832699e-08, "logits/chosen": -1.804541826248169, "logits/rejected": -1.8112810850143433, "logps/chosen": -143.16195678710938, "logps/ref_chosen": -77.94320678710938, "logps/ref_rejected": -98.41210174560547, "logps/rejected": -204.6304168701172, "loss": 1.0801, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7118812203407288, "rewards/margins": 0.44240012764930725, "rewards/rejected": -1.1542813777923584, "step": 588 }, { "epoch": 0.890400604686319, "grad_norm": 7.836277008056641, "kl/avg_steps": 0.6875, "kl/beta": 0.010875582695007324, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.840267971970344e-08, "logits/chosen": -1.6829819679260254, "logits/rejected": -1.4134776592254639, "logps/chosen": -132.46250915527344, "logps/ref_chosen": -75.18646240234375, "logps/ref_rejected": -93.35910034179688, "logps/rejected": -196.56468200683594, "loss": 1.0411, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6206521987915039, "rewards/margins": 0.493114709854126, "rewards/rejected": -1.1137669086456299, "step": 589 }, { "epoch": 0.891912320483749, "grad_norm": 8.583623886108398, "kl/avg_steps": 0.4375, "kl/beta": 0.010801323689520359, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -1.4794654846191406, "logits/rejected": -1.2498228549957275, "logps/chosen": -152.60191345214844, "logps/ref_chosen": -86.9908447265625, "logps/ref_rejected": -100.61723327636719, "logps/rejected": -209.6766357421875, "loss": 1.0804, "rewards/accuracies": 0.75, "rewards/chosen": -0.7086925506591797, "rewards/margins": 0.463258296251297, "rewards/rejected": -1.1719508171081543, "step": 590 }, { "epoch": 0.8934240362811792, "grad_norm": 8.543137550354004, "kl/avg_steps": 0.421875, "kl/beta": 0.01075427420437336, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.7419845883949098e-08, "logits/chosen": -1.5658700466156006, "logits/rejected": -1.5437417030334473, "logps/chosen": -126.18582153320312, "logps/ref_chosen": -74.85809326171875, "logps/ref_rejected": -102.75840759277344, "logps/rejected": -199.054931640625, "loss": 1.0779, "rewards/accuracies": 0.75, "rewards/chosen": -0.5521831512451172, "rewards/margins": 0.476978600025177, "rewards/rejected": -1.029161810874939, "step": 591 }, { "epoch": 0.8949357520786092, "grad_norm": 6.509529113769531, "kl/avg_steps": 0.53125, "kl/beta": 0.010709094814956188, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6938180788793556e-08, "logits/chosen": -1.451214075088501, "logits/rejected": -1.452272653579712, "logps/chosen": -123.59173583984375, "logps/ref_chosen": -67.90579223632812, "logps/ref_rejected": -100.35234069824219, "logps/rejected": -196.5985107421875, "loss": 1.0975, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5943005084991455, "rewards/margins": 0.42885279655456543, "rewards/rejected": -1.023153305053711, "step": 592 }, { "epoch": 0.8964474678760394, "grad_norm": 7.256924152374268, "kl/avg_steps": 0.5, "kl/beta": 0.010652503930032253, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.6463034933723336e-08, "logits/chosen": -1.1163625717163086, "logits/rejected": -1.4272210597991943, "logps/chosen": -113.16133117675781, "logps/ref_chosen": -59.29489517211914, "logps/ref_rejected": -85.31307983398438, "logps/rejected": -180.05154418945312, "loss": 1.1152, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5734096765518188, "rewards/margins": 0.4289560914039612, "rewards/rejected": -1.0023657083511353, "step": 593 }, { "epoch": 0.8979591836734694, "grad_norm": 8.194808959960938, "kl/avg_steps": 0.375, "kl/beta": 0.010599506087601185, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5994421609589385e-08, "logits/chosen": -1.1510372161865234, "logits/rejected": -1.1432057619094849, "logps/chosen": -146.3094482421875, "logps/ref_chosen": -83.14643859863281, "logps/ref_rejected": -88.201904296875, "logps/rejected": -188.5896453857422, "loss": 1.1219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6681286096572876, "rewards/margins": 0.3892432451248169, "rewards/rejected": -1.0573718547821045, "step": 594 }, { "epoch": 0.8994708994708994, "grad_norm": 9.508260726928711, "kl/avg_steps": 0.53125, "kl/beta": 0.010559906251728535, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.553235392451377e-08, "logits/chosen": -1.1011279821395874, "logits/rejected": -1.4126760959625244, "logps/chosen": -129.28802490234375, "logps/ref_chosen": -70.40016174316406, "logps/ref_rejected": -103.95550537109375, "logps/rejected": -212.98629760742188, "loss": 1.0528, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6204409599304199, "rewards/margins": 0.5221980810165405, "rewards/rejected": -1.14263916015625, "step": 595 }, { "epoch": 0.9009826152683296, "grad_norm": 7.493839740753174, "kl/avg_steps": 0.09375, "kl/beta": 0.010504103265702724, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.507684480352292e-08, "logits/chosen": -1.470296859741211, "logits/rejected": -1.4461066722869873, "logps/chosen": -156.56211853027344, "logps/ref_chosen": -86.083740234375, "logps/ref_rejected": -78.41990661621094, "logps/rejected": -165.70452880859375, "loss": 1.3176, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7421023845672607, "rewards/margins": 0.17188970744609833, "rewards/rejected": -0.9139920473098755, "step": 596 }, { "epoch": 0.9024943310657596, "grad_norm": 6.163330554962158, "kl/avg_steps": 0.40625, "kl/beta": 0.010494264774024487, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.4627906988186111e-08, "logits/chosen": -1.3851312398910522, "logits/rejected": -1.1142826080322266, "logps/chosen": -121.51483154296875, "logps/ref_chosen": -67.8086166381836, "logps/ref_rejected": -71.09245300292969, "logps/rejected": -162.0400390625, "loss": 1.1369, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5630039572715759, "rewards/margins": 0.3849661350250244, "rewards/rejected": -0.9479700922966003, "step": 597 }, { "epoch": 0.9040060468631897, "grad_norm": 6.907870292663574, "kl/avg_steps": 0.359375, "kl/beta": 0.010451804846525192, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.4185553036259095e-08, "logits/chosen": -1.369914174079895, "logits/rejected": -1.562995195388794, "logps/chosen": -149.61148071289062, "logps/ref_chosen": -74.31095123291016, "logps/ref_rejected": -98.08122253417969, "logps/rejected": -198.58651733398438, "loss": 1.2303, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7870241403579712, "rewards/margins": 0.2586115598678589, "rewards/rejected": -1.04563570022583, "step": 598 }, { "epoch": 0.9055177626606198, "grad_norm": 7.62575101852417, "kl/avg_steps": 0.34375, "kl/beta": 0.01041437778621912, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.3749795321332885e-08, "logits/chosen": -1.333286166191101, "logits/rejected": -1.314929723739624, "logps/chosen": -143.8874969482422, "logps/ref_chosen": -74.21861267089844, "logps/ref_rejected": -90.1492919921875, "logps/rejected": -192.62564086914062, "loss": 1.1717, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7245805263519287, "rewards/margins": 0.3361985683441162, "rewards/rejected": -1.060779094696045, "step": 599 }, { "epoch": 0.9070294784580499, "grad_norm": 7.58573579788208, "kl/avg_steps": 0.3125, "kl/beta": 0.01037870068103075, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.3320646032487393e-08, "logits/chosen": -1.4237275123596191, "logits/rejected": -1.5688844919204712, "logps/chosen": -144.674560546875, "logps/ref_chosen": -79.34190368652344, "logps/ref_rejected": -97.0519790649414, "logps/rejected": -193.67965698242188, "loss": 1.1764, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6775575876235962, "rewards/margins": 0.3196982741355896, "rewards/rejected": -0.997255802154541, "step": 600 }, { "epoch": 0.9070294784580499, "eval_kl/n_epsilon_steps": 0.26892605423927307, "eval_kl/p_epsilon_steps": 0.7288732528686523, "eval_logits/chosen": -1.305195689201355, "eval_logits/rejected": -1.431878685951233, "eval_logps/chosen": -149.94447326660156, "eval_logps/ref_chosen": -86.90177917480469, "eval_logps/ref_rejected": -96.69639587402344, "eval_logps/rejected": -198.72705078125, "eval_loss": 0.563292920589447, "eval_rewards/accuracies": 0.73283451795578, "eval_rewards/chosen": -0.6511958837509155, "eval_rewards/margins": 0.3975641429424286, "eval_rewards/rejected": -1.0487600564956665, "eval_runtime": 46.8107, "eval_samples_per_second": 49.198, "eval_steps_per_second": 1.538, "step": 600 }, { "epoch": 0.90854119425548, "grad_norm": 8.043039321899414, "kl/avg_steps": 0.5, "kl/beta": 0.0103463688865304, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2898117173950868e-08, "logits/chosen": -1.517151117324829, "logits/rejected": -1.5460131168365479, "logps/chosen": -125.53500366210938, "logps/ref_chosen": -72.06497192382812, "logps/ref_rejected": -97.60928344726562, "logps/rejected": -198.99093627929688, "loss": 1.0588, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5525596141815186, "rewards/margins": 0.48932531476020813, "rewards/rejected": -1.0418848991394043, "step": 601 }, { "epoch": 0.91005291005291, "grad_norm": 8.432663917541504, "kl/avg_steps": 0.46875, "kl/beta": 0.010294893756508827, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.2482220564763667e-08, "logits/chosen": -1.4632465839385986, "logits/rejected": -1.6077954769134521, "logps/chosen": -126.64736938476562, "logps/ref_chosen": -77.80416870117188, "logps/ref_rejected": -89.05025482177734, "logps/rejected": -179.86968994140625, "loss": 1.0903, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5022462606430054, "rewards/margins": 0.4265432357788086, "rewards/rejected": -0.9287895560264587, "step": 602 }, { "epoch": 0.9115646258503401, "grad_norm": 7.695731163024902, "kl/avg_steps": 0.46875, "kl/beta": 0.010246861726045609, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.2072967838448051e-08, "logits/chosen": -1.5648958683013916, "logits/rejected": -1.3041167259216309, "logps/chosen": -133.23721313476562, "logps/ref_chosen": -68.30155944824219, "logps/ref_rejected": -90.542724609375, "logps/rejected": -192.48605346679688, "loss": 1.1478, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6637769937515259, "rewards/margins": 0.3735578656196594, "rewards/rejected": -1.0373347997665405, "step": 603 }, { "epoch": 0.9130763416477702, "grad_norm": 5.707242012023926, "kl/avg_steps": 0.40625, "kl/beta": 0.010199054144322872, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1670370442682459e-08, "logits/chosen": -1.4136242866516113, "logits/rejected": -1.5028131008148193, "logps/chosen": -142.22543334960938, "logps/ref_chosen": -90.55952453613281, "logps/ref_rejected": -84.6327133178711, "logps/rejected": -173.35304260253906, "loss": 1.1714, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5263998508453369, "rewards/margins": 0.37176942825317383, "rewards/rejected": -0.8981692790985107, "step": 604 }, { "epoch": 0.9145880574452003, "grad_norm": 7.411596298217773, "kl/avg_steps": 0.4375, "kl/beta": 0.010157788172364235, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1274439638981532e-08, "logits/chosen": -1.4782711267471313, "logits/rejected": -1.6140611171722412, "logps/chosen": -147.58096313476562, "logps/ref_chosen": -80.26661682128906, "logps/ref_rejected": -100.26485443115234, "logps/rejected": -201.37017822265625, "loss": 1.1592, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6824374794960022, "rewards/margins": 0.33790159225463867, "rewards/rejected": -1.0203391313552856, "step": 605 }, { "epoch": 0.9160997732426304, "grad_norm": 6.872825622558594, "kl/avg_steps": 0.375, "kl/beta": 0.010113541036844254, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.0885186502381016e-08, "logits/chosen": -1.3783996105194092, "logits/rejected": -1.3939440250396729, "logps/chosen": -126.33575439453125, "logps/ref_chosen": -70.73554992675781, "logps/ref_rejected": -95.9410400390625, "logps/rejected": -190.52963256835938, "loss": 1.1098, "rewards/accuracies": 0.75, "rewards/chosen": -0.5604730844497681, "rewards/margins": 0.38919681310653687, "rewards/rejected": -0.9496699571609497, "step": 606 }, { "epoch": 0.9176114890400605, "grad_norm": 6.968571186065674, "kl/avg_steps": 0.5625, "kl/beta": 0.010075757279992104, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0502621921127774e-08, "logits/chosen": -1.3670051097869873, "logits/rejected": -1.3761980533599854, "logps/chosen": -146.70367431640625, "logps/ref_chosen": -81.26203918457031, "logps/ref_rejected": -92.71575927734375, "logps/rejected": -199.63870239257812, "loss": 1.0874, "rewards/accuracies": 0.75, "rewards/chosen": -0.6559546589851379, "rewards/margins": 0.4126221537590027, "rewards/rejected": -1.0685768127441406, "step": 607 }, { "epoch": 0.9191232048374905, "grad_norm": 8.588226318359375, "kl/avg_steps": 0.5, "kl/beta": 0.010019398294389248, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -1.1897143125534058, "logits/rejected": -1.706724762916565, "logps/chosen": -152.38760375976562, "logps/ref_chosen": -82.65309143066406, "logps/ref_rejected": -110.64334106445312, "logps/rejected": -209.7471923828125, "loss": 1.1965, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6966760754585266, "rewards/margins": 0.28950873017311096, "rewards/rejected": -0.9861847758293152, "step": 608 }, { "epoch": 0.9206349206349206, "grad_norm": 7.8772454261779785, "kl/avg_steps": 0.65625, "kl/beta": 0.009969550184905529, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 9.757601041885694e-09, "logits/chosen": -1.3571372032165527, "logits/rejected": -1.4184623956680298, "logps/chosen": -123.52580261230469, "logps/ref_chosen": -68.20231628417969, "logps/ref_rejected": -81.90515899658203, "logps/rejected": -178.79531860351562, "loss": 1.0748, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5487229228019714, "rewards/margins": 0.4092875123023987, "rewards/rejected": -0.9580105543136597, "step": 609 }, { "epoch": 0.9221466364323507, "grad_norm": 9.354720115661621, "kl/avg_steps": 0.375, "kl/beta": 0.009904551319777966, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 9.395165583732379e-09, "logits/chosen": -1.4947376251220703, "logits/rejected": -1.5565390586853027, "logps/chosen": -162.8250732421875, "logps/ref_chosen": -99.01324462890625, "logps/ref_rejected": -102.26054382324219, "logps/rejected": -206.55862426757812, "loss": 1.1376, "rewards/accuracies": 0.6875, "rewards/chosen": -0.633036732673645, "rewards/margins": 0.39495325088500977, "rewards/rejected": -1.0279901027679443, "step": 610 }, { "epoch": 0.9236583522297808, "grad_norm": 6.941373348236084, "kl/avg_steps": 0.328125, "kl/beta": 0.009867548011243343, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 9.03946036001449e-09, "logits/chosen": -1.7924981117248535, "logits/rejected": -1.8070666790008545, "logps/chosen": -124.20469665527344, "logps/ref_chosen": -66.36254119873047, "logps/ref_rejected": -88.74557495117188, "logps/rejected": -175.82785034179688, "loss": 1.192, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5700873136520386, "rewards/margins": 0.28417256474494934, "rewards/rejected": -0.8542598485946655, "step": 611 }, { "epoch": 0.9251700680272109, "grad_norm": 6.236355781555176, "kl/avg_steps": 0.5, "kl/beta": 0.009835276752710342, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.690495320571839e-09, "logits/chosen": -1.3240669965744019, "logits/rejected": -1.4752863645553589, "logps/chosen": -145.45907592773438, "logps/ref_chosen": -78.6339111328125, "logps/ref_rejected": -108.34970092773438, "logps/rejected": -221.23175048828125, "loss": 1.0902, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6558361053466797, "rewards/margins": 0.4467426538467407, "rewards/rejected": -1.10257887840271, "step": 612 }, { "epoch": 0.926681783824641, "grad_norm": 6.665154933929443, "kl/avg_steps": 0.59375, "kl/beta": 0.00978634413331747, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.348280226706722e-09, "logits/chosen": -1.4203985929489136, "logits/rejected": -0.9556962251663208, "logps/chosen": -125.75762939453125, "logps/ref_chosen": -73.3539047241211, "logps/ref_rejected": -76.91837310791016, "logps/rejected": -180.04534912109375, "loss": 1.0513, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5114285945892334, "rewards/margins": 0.4900427460670471, "rewards/rejected": -1.0014714002609253, "step": 613 }, { "epoch": 0.9281934996220711, "grad_norm": 7.1324639320373535, "kl/avg_steps": 0.5625, "kl/beta": 0.009728580713272095, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.012824650910937e-09, "logits/chosen": -1.2377476692199707, "logits/rejected": -0.9786287546157837, "logps/chosen": -145.08615112304688, "logps/ref_chosen": -77.80007934570312, "logps/ref_rejected": -89.05572509765625, "logps/rejected": -194.45004272460938, "loss": 1.1136, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6522784233093262, "rewards/margins": 0.36581069231033325, "rewards/rejected": -1.0180891752243042, "step": 614 }, { "epoch": 0.9297052154195011, "grad_norm": 6.5117716789245605, "kl/avg_steps": 0.546875, "kl/beta": 0.009674163535237312, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.684137976598088e-09, "logits/chosen": -1.635801911354065, "logits/rejected": -1.423606038093567, "logps/chosen": -156.5076446533203, "logps/ref_chosen": -90.06971740722656, "logps/ref_rejected": -118.7764892578125, "logps/rejected": -231.023681640625, "loss": 1.0939, "rewards/accuracies": 0.78125, "rewards/chosen": -0.641755223274231, "rewards/margins": 0.43708616495132446, "rewards/rejected": -1.0788414478302002, "step": 615 }, { "epoch": 0.9312169312169312, "grad_norm": 6.9720258712768555, "kl/avg_steps": 0.390625, "kl/beta": 0.009621545672416687, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 7.36222939784098e-09, "logits/chosen": -1.3394547700881958, "logits/rejected": -1.4010214805603027, "logps/chosen": -138.58969116210938, "logps/ref_chosen": -74.62954711914062, "logps/ref_rejected": -93.655029296875, "logps/rejected": -195.61346435546875, "loss": 1.1321, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6138941049575806, "rewards/margins": 0.360870897769928, "rewards/rejected": -0.9747650623321533, "step": 616 }, { "epoch": 0.9327286470143613, "grad_norm": 7.865924835205078, "kl/avg_steps": 0.53125, "kl/beta": 0.009584108367562294, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.047107919114586e-09, "logits/chosen": -1.1265982389450073, "logits/rejected": -1.2221885919570923, "logps/chosen": -147.52825927734375, "logps/ref_chosen": -75.98182678222656, "logps/ref_rejected": -97.1640625, "logps/rejected": -204.99545288085938, "loss": 1.1393, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6835160255432129, "rewards/margins": 0.34294775128364563, "rewards/rejected": -1.0264636278152466, "step": 617 }, { "epoch": 0.9342403628117913, "grad_norm": 13.488085746765137, "kl/avg_steps": 0.375, "kl/beta": 0.00953346211463213, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.738782355044048e-09, "logits/chosen": -1.4303864240646362, "logits/rejected": -1.5783617496490479, "logps/chosen": -134.54786682128906, "logps/ref_chosen": -74.47208404541016, "logps/ref_rejected": -107.09980010986328, "logps/rejected": -201.03564453125, "loss": 1.1718, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5725345611572266, "rewards/margins": 0.31801384687423706, "rewards/rejected": -0.8905483484268188, "step": 618 }, { "epoch": 0.9357520786092215, "grad_norm": 6.205716133117676, "kl/avg_steps": 0.5625, "kl/beta": 0.009497844614088535, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 6.437261330158206e-09, "logits/chosen": -1.154737949371338, "logits/rejected": -1.3640652894973755, "logps/chosen": -131.15756225585938, "logps/ref_chosen": -70.84220886230469, "logps/ref_rejected": -98.07801818847656, "logps/rejected": -203.77041625976562, "loss": 1.0922, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5716289281845093, "rewards/margins": 0.425299733877182, "rewards/rejected": -0.9969286918640137, "step": 619 }, { "epoch": 0.9372637944066515, "grad_norm": 6.4136128425598145, "kl/avg_steps": 0.53125, "kl/beta": 0.00944471824914217, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.142553278648238e-09, "logits/chosen": -1.5454761981964111, "logits/rejected": -1.2460343837738037, "logps/chosen": -134.32882690429688, "logps/ref_chosen": -76.93606567382812, "logps/ref_rejected": -81.28453063964844, "logps/rejected": -172.30657958984375, "loss": 1.1796, "rewards/accuracies": 0.75, "rewards/chosen": -0.540812611579895, "rewards/margins": 0.31257152557373047, "rewards/rejected": -0.8533841371536255, "step": 620 }, { "epoch": 0.9387755102040817, "grad_norm": 6.1778082847595215, "kl/avg_steps": 0.40625, "kl/beta": 0.009394808672368526, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.854666444131934e-09, "logits/chosen": -1.2180217504501343, "logits/rejected": -1.4678857326507568, "logps/chosen": -135.63153076171875, "logps/ref_chosen": -69.87464904785156, "logps/ref_rejected": -105.61328887939453, "logps/rejected": -203.57086181640625, "loss": 1.1843, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6170968413352966, "rewards/margins": 0.29768118262290955, "rewards/rejected": -0.9147779941558838, "step": 621 }, { "epoch": 0.9402872260015117, "grad_norm": 6.1825947761535645, "kl/avg_steps": 0.53125, "kl/beta": 0.009356796741485596, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.573608879422875e-09, "logits/chosen": -1.7117747068405151, "logits/rejected": -1.6740427017211914, "logps/chosen": -144.62969970703125, "logps/ref_chosen": -78.9598388671875, "logps/ref_rejected": -97.906494140625, "logps/rejected": -199.75100708007812, "loss": 1.1453, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6132454872131348, "rewards/margins": 0.3336794972419739, "rewards/rejected": -0.9469249844551086, "step": 622 }, { "epoch": 0.9417989417989417, "grad_norm": 5.886653423309326, "kl/avg_steps": 0.46875, "kl/beta": 0.00930735096335411, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.299388446305342e-09, "logits/chosen": -1.468321442604065, "logits/rejected": -1.4235600233078003, "logps/chosen": -155.97503662109375, "logps/ref_chosen": -83.22647094726562, "logps/ref_rejected": -105.13624572753906, "logps/rejected": -216.748046875, "loss": 1.1266, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6760995388031006, "rewards/margins": 0.3567197918891907, "rewards/rejected": -1.0328192710876465, "step": 623 }, { "epoch": 0.9433106575963719, "grad_norm": 6.489195346832275, "kl/avg_steps": 0.484375, "kl/beta": 0.009263926185667515, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.03201281531429e-09, "logits/chosen": -1.2762702703475952, "logits/rejected": -1.4331917762756348, "logps/chosen": -121.82546997070312, "logps/ref_chosen": -66.10560607910156, "logps/ref_rejected": -91.66778564453125, "logps/rejected": -195.05531311035156, "loss": 1.0694, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5147716999053955, "rewards/margins": 0.4362095594406128, "rewards/rejected": -0.9509812593460083, "step": 624 }, { "epoch": 0.9448223733938019, "grad_norm": 6.928590297698975, "kl/avg_steps": 0.28125, "kl/beta": 0.009219270199537277, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.7714894655209174e-09, "logits/chosen": -1.1287927627563477, "logits/rejected": -1.2948472499847412, "logps/chosen": -138.32406616210938, "logps/ref_chosen": -73.20295715332031, "logps/ref_rejected": -105.31025695800781, "logps/rejected": -197.34921264648438, "loss": 1.2353, "rewards/accuracies": 0.640625, "rewards/chosen": -0.6002695560455322, "rewards/margins": 0.24321919679641724, "rewards/rejected": -0.8434886932373047, "step": 625 }, { "epoch": 0.9463340891912321, "grad_norm": 6.270933628082275, "kl/avg_steps": 0.375, "kl/beta": 0.009193413890898228, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.517825684323323e-09, "logits/chosen": -1.0171754360198975, "logits/rejected": -1.2670618295669556, "logps/chosen": -121.7349853515625, "logps/ref_chosen": -62.181278228759766, "logps/ref_rejected": -108.17747497558594, "logps/rejected": -211.99676513671875, "loss": 1.1301, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5475019216537476, "rewards/margins": 0.40066835284233093, "rewards/rejected": -0.9481702446937561, "step": 626 }, { "epoch": 0.9478458049886621, "grad_norm": 6.459384918212891, "kl/avg_steps": 0.59375, "kl/beta": 0.009159067645668983, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.271028567242818e-09, "logits/chosen": -1.36244797706604, "logits/rejected": -1.6176857948303223, "logps/chosen": -138.19891357421875, "logps/ref_chosen": -77.72123718261719, "logps/ref_rejected": -114.40547180175781, "logps/rejected": -227.02944946289062, "loss": 1.0443, "rewards/accuracies": 0.78125, "rewards/chosen": -0.551235556602478, "rewards/margins": 0.4718520939350128, "rewards/rejected": -1.023087501525879, "step": 627 }, { "epoch": 0.9493575207860923, "grad_norm": 6.517147541046143, "kl/avg_steps": 0.53125, "kl/beta": 0.009105006232857704, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.0311050177251895e-09, "logits/chosen": -1.5668630599975586, "logits/rejected": -1.0957720279693604, "logps/chosen": -128.68606567382812, "logps/ref_chosen": -70.71195983886719, "logps/ref_rejected": -93.85910034179688, "logps/rejected": -200.63211059570312, "loss": 1.0998, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5267102718353271, "rewards/margins": 0.4377779960632324, "rewards/rejected": -0.9644882678985596, "step": 628 }, { "epoch": 0.9508692365835223, "grad_norm": 7.322593688964844, "kl/avg_steps": 0.4375, "kl/beta": 0.00905689224600792, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.798061746947995e-09, "logits/chosen": -1.5906280279159546, "logits/rejected": -1.5747921466827393, "logps/chosen": -145.18203735351562, "logps/ref_chosen": -88.66283416748047, "logps/ref_rejected": -94.67845153808594, "logps/rejected": -190.83462524414062, "loss": 1.1164, "rewards/accuracies": 0.75, "rewards/chosen": -0.5112582445144653, "rewards/margins": 0.35449928045272827, "rewards/rejected": -0.8657574653625488, "step": 629 }, { "epoch": 0.9523809523809523, "grad_norm": 4.681629657745361, "kl/avg_steps": 0.5, "kl/beta": 0.009017440490424633, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.5719052736323806e-09, "logits/chosen": -1.400179386138916, "logits/rejected": -1.4847885370254517, "logps/chosen": -132.58355712890625, "logps/ref_chosen": -72.94979858398438, "logps/ref_rejected": -92.7632827758789, "logps/rejected": -196.56716918945312, "loss": 1.0964, "rewards/accuracies": 0.734375, "rewards/chosen": -0.535965085029602, "rewards/margins": 0.39343059062957764, "rewards/rejected": -0.9293956160545349, "step": 630 }, { "epoch": 0.9538926681783825, "grad_norm": 6.4820966720581055, "kl/avg_steps": 0.5625, "kl/beta": 0.00897257775068283, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.352641923861144e-09, "logits/chosen": -1.6833112239837646, "logits/rejected": -1.9373281002044678, "logps/chosen": -134.91073608398438, "logps/ref_chosen": -78.58656311035156, "logps/ref_rejected": -115.38685607910156, "logps/rejected": -221.26397705078125, "loss": 1.0794, "rewards/accuracies": 0.796875, "rewards/chosen": -0.504284143447876, "rewards/margins": 0.43889355659484863, "rewards/rejected": -0.9431777000427246, "step": 631 }, { "epoch": 0.9554043839758125, "grad_norm": 6.496461868286133, "kl/avg_steps": 0.5, "kl/beta": 0.008922388777136803, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.140277830901428e-09, "logits/chosen": -1.3603768348693848, "logits/rejected": -1.4694833755493164, "logps/chosen": -133.49276733398438, "logps/ref_chosen": -75.24861907958984, "logps/ref_rejected": -82.98665618896484, "logps/rejected": -185.62823486328125, "loss": 1.0996, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5186818242073059, "rewards/margins": 0.3908722400665283, "rewards/rejected": -0.9095540642738342, "step": 632 }, { "epoch": 0.9569160997732427, "grad_norm": 8.019292831420898, "kl/avg_steps": 0.453125, "kl/beta": 0.008877999149262905, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.9348189350335007e-09, "logits/chosen": -1.3611078262329102, "logits/rejected": -1.2718690633773804, "logps/chosen": -115.50188446044922, "logps/ref_chosen": -68.8402099609375, "logps/ref_rejected": -84.64610290527344, "logps/rejected": -178.7694091796875, "loss": 1.0889, "rewards/accuracies": 0.75, "rewards/chosen": -0.41344064474105835, "rewards/margins": 0.4160441756248474, "rewards/rejected": -0.8294848203659058, "step": 633 }, { "epoch": 0.9584278155706727, "grad_norm": 7.965339183807373, "kl/avg_steps": 0.25, "kl/beta": 0.00883795227855444, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 2.736270983384276e-09, "logits/chosen": -1.409487009048462, "logits/rejected": -1.4064576625823975, "logps/chosen": -145.90066528320312, "logps/ref_chosen": -77.0589599609375, "logps/ref_rejected": -74.37579345703125, "logps/rejected": -159.47303771972656, "loss": 1.3238, "rewards/accuracies": 0.625, "rewards/chosen": -0.6090530753135681, "rewards/margins": 0.13959276676177979, "rewards/rejected": -0.7486459016799927, "step": 634 }, { "epoch": 0.9599395313681028, "grad_norm": 5.619506359100342, "kl/avg_steps": 0.359375, "kl/beta": 0.008815912529826164, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.5446395297668287e-09, "logits/chosen": -1.5071735382080078, "logits/rejected": -1.6686877012252808, "logps/chosen": -162.4663543701172, "logps/ref_chosen": -85.60243225097656, "logps/ref_rejected": -104.29497528076172, "logps/rejected": -209.23851013183594, "loss": 1.2379, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6778690218925476, "rewards/margins": 0.24298283457756042, "rewards/rejected": -0.9208518266677856, "step": 635 }, { "epoch": 0.9614512471655329, "grad_norm": 6.538994312286377, "kl/avg_steps": 0.59375, "kl/beta": 0.008784343488514423, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.359929934524829e-09, "logits/chosen": -1.1326661109924316, "logits/rejected": -1.557888388633728, "logps/chosen": -122.93463134765625, "logps/ref_chosen": -68.72154235839844, "logps/ref_rejected": -97.44863891601562, "logps/rejected": -203.28976440429688, "loss": 1.0504, "rewards/accuracies": 0.765625, "rewards/chosen": -0.47411519289016724, "rewards/margins": 0.4482189416885376, "rewards/rejected": -0.9223341345787048, "step": 636 }, { "epoch": 0.9629629629629629, "grad_norm": 5.887253284454346, "kl/avg_steps": 0.40625, "kl/beta": 0.00873249489814043, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.1821473643827137e-09, "logits/chosen": -1.506112813949585, "logits/rejected": -1.5703227519989014, "logps/chosen": -167.31407165527344, "logps/ref_chosen": -92.38919067382812, "logps/ref_rejected": -103.70460510253906, "logps/rejected": -220.37106323242188, "loss": 1.1261, "rewards/accuracies": 0.734375, "rewards/chosen": -0.652953565120697, "rewards/margins": 0.35979682207107544, "rewards/rejected": -1.0127503871917725, "step": 637 }, { "epoch": 0.9644746787603931, "grad_norm": 5.912980556488037, "kl/avg_steps": 0.453125, "kl/beta": 0.008697162382304668, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.0112967923011646e-09, "logits/chosen": -1.3402650356292725, "logits/rejected": -1.5520743131637573, "logps/chosen": -152.41412353515625, "logps/ref_chosen": -83.36921691894531, "logps/ref_rejected": -103.04508209228516, "logps/rejected": -209.3965301513672, "loss": 1.1568, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5985315442085266, "rewards/margins": 0.31978681683540344, "rewards/rejected": -0.9183183312416077, "step": 638 }, { "epoch": 0.9659863945578231, "grad_norm": 5.9383931159973145, "kl/avg_steps": 0.4375, "kl/beta": 0.008657931350171566, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.847382997337943e-09, "logits/chosen": -1.5128107070922852, "logits/rejected": -1.6139025688171387, "logps/chosen": -128.01541137695312, "logps/ref_chosen": -70.45248413085938, "logps/ref_rejected": -93.77748107910156, "logps/rejected": -196.63209533691406, "loss": 1.1036, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49751636385917664, "rewards/margins": 0.3872116506099701, "rewards/rejected": -0.8847280144691467, "step": 639 }, { "epoch": 0.9674981103552532, "grad_norm": 6.486922264099121, "kl/avg_steps": 0.53125, "kl/beta": 0.008620217442512512, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.690410564514244e-09, "logits/chosen": -1.3309577703475952, "logits/rejected": -1.6244860887527466, "logps/chosen": -132.64532470703125, "logps/ref_chosen": -68.51570129394531, "logps/ref_rejected": -92.35081481933594, "logps/rejected": -189.81338500976562, "loss": 1.2017, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5512025356292725, "rewards/margins": 0.2822040319442749, "rewards/rejected": -0.8334065675735474, "step": 640 }, { "epoch": 0.9690098261526833, "grad_norm": 6.680337905883789, "kl/avg_steps": 0.4375, "kl/beta": 0.00857466459274292, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.5403838846864692e-09, "logits/chosen": -1.489738941192627, "logits/rejected": -1.2841875553131104, "logps/chosen": -161.3524169921875, "logps/ref_chosen": -92.35102844238281, "logps/ref_rejected": -102.4269790649414, "logps/rejected": -206.33120727539062, "loss": 1.1733, "rewards/accuracies": 0.75, "rewards/chosen": -0.5910571813583374, "rewards/margins": 0.29499414563179016, "rewards/rejected": -0.8860512971878052, "step": 641 }, { "epoch": 0.9705215419501134, "grad_norm": 6.556896686553955, "kl/avg_steps": 0.4375, "kl/beta": 0.008537313900887966, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.3973071544233218e-09, "logits/chosen": -1.4611645936965942, "logits/rejected": -1.2641469240188599, "logps/chosen": -156.3619384765625, "logps/ref_chosen": -88.39617919921875, "logps/ref_rejected": -88.73035430908203, "logps/rejected": -188.46615600585938, "loss": 1.1993, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5795704126358032, "rewards/margins": 0.2666812539100647, "rewards/rejected": -0.8462516069412231, "step": 642 }, { "epoch": 0.9720332577475435, "grad_norm": 8.884759902954102, "kl/avg_steps": 0.4375, "kl/beta": 0.00850012619048357, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.261184375888541e-09, "logits/chosen": -1.7491729259490967, "logits/rejected": -2.072495222091675, "logps/chosen": -149.3084716796875, "logps/ref_chosen": -84.83087921142578, "logps/ref_rejected": -105.31499481201172, "logps/rejected": -205.96807861328125, "loss": 1.1847, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5469406247138977, "rewards/margins": 0.3024827241897583, "rewards/rejected": -0.849423348903656, "step": 643 }, { "epoch": 0.9735449735449735, "grad_norm": 5.750825881958008, "kl/avg_steps": 0.375, "kl/beta": 0.008463099598884583, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -1.1256132125854492, "logits/rejected": -1.4251909255981445, "logps/chosen": -128.88160705566406, "logps/ref_chosen": -65.11122131347656, "logps/ref_rejected": -80.4027328491211, "logps/rejected": -177.0889892578125, "loss": 1.2123, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5400751233100891, "rewards/margins": 0.2735128402709961, "rewards/rejected": -0.81358802318573, "step": 644 }, { "epoch": 0.9750566893424036, "grad_norm": 6.494144439697266, "kl/avg_steps": 0.5, "kl/beta": 0.008431482128798962, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0098157099674987e-09, "logits/chosen": -1.3750994205474854, "logits/rejected": -1.037635087966919, "logps/chosen": -139.2389373779297, "logps/ref_chosen": -76.93634033203125, "logps/ref_rejected": -89.14311981201172, "logps/rejected": -194.06703186035156, "loss": 1.1195, "rewards/accuracies": 0.75, "rewards/chosen": -0.5241885185241699, "rewards/margins": 0.35475391149520874, "rewards/rejected": -0.8789424300193787, "step": 645 }, { "epoch": 0.9765684051398337, "grad_norm": 6.124319076538086, "kl/avg_steps": 0.5625, "kl/beta": 0.008389534428715706, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.945768539031783e-10, "logits/chosen": -1.310829758644104, "logits/rejected": -1.2478933334350586, "logps/chosen": -150.88714599609375, "logps/ref_chosen": -77.69122314453125, "logps/ref_rejected": -98.14374542236328, "logps/rejected": -213.10272216796875, "loss": 1.1296, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6121255159378052, "rewards/margins": 0.34585699439048767, "rewards/rejected": -0.9579824209213257, "step": 646 }, { "epoch": 0.9780801209372638, "grad_norm": 7.085869789123535, "kl/avg_steps": 0.71875, "kl/beta": 0.008342606946825981, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.863060120144316e-10, "logits/chosen": -1.622401475906372, "logits/rejected": -1.438711404800415, "logps/chosen": -154.6905059814453, "logps/ref_chosen": -83.79997253417969, "logps/ref_rejected": -116.81964874267578, "logps/rejected": -240.5399932861328, "loss": 1.0493, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5890640616416931, "rewards/margins": 0.4357026517391205, "rewards/rejected": -1.0247666835784912, "step": 647 }, { "epoch": 0.9795918367346939, "grad_norm": 5.251621723175049, "kl/avg_steps": 0.5, "kl/beta": 0.008283072151243687, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.850062128694045e-10, "logits/chosen": -1.2734894752502441, "logits/rejected": -1.5064399242401123, "logps/chosen": -159.29747009277344, "logps/ref_chosen": -85.9629898071289, "logps/ref_rejected": -101.36550903320312, "logps/rejected": -210.07115173339844, "loss": 1.1738, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6054705381393433, "rewards/margins": 0.2887694835662842, "rewards/rejected": -0.8942400217056274, "step": 648 }, { "epoch": 0.981103552532124, "grad_norm": 6.989095211029053, "kl/avg_steps": 0.5, "kl/beta": 0.008241862989962101, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.906802900412788e-10, "logits/chosen": -1.115774393081665, "logits/rejected": -1.133882999420166, "logps/chosen": -133.21107482910156, "logps/ref_chosen": -68.64892578125, "logps/ref_rejected": -89.84898376464844, "logps/rejected": -193.7150421142578, "loss": 1.1629, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5316280722618103, "rewards/margins": 0.31877434253692627, "rewards/rejected": -0.8504023551940918, "step": 649 }, { "epoch": 0.982615268329554, "grad_norm": 6.10765266418457, "kl/avg_steps": 0.46875, "kl/beta": 0.00820085871964693, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 5.033308820289184e-10, "logits/chosen": -0.9832993745803833, "logits/rejected": -1.3950834274291992, "logps/chosen": -128.94815063476562, "logps/ref_chosen": -72.97265625, "logps/ref_rejected": -93.0461654663086, "logps/rejected": -192.0283203125, "loss": 1.1354, "rewards/accuracies": 0.765625, "rewards/chosen": -0.45911887288093567, "rewards/margins": 0.3477191925048828, "rewards/rejected": -0.8068380355834961, "step": 650 }, { "epoch": 0.9841269841269841, "grad_norm": 9.630555152893066, "kl/avg_steps": 0.40625, "kl/beta": 0.008162596262991428, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.2296043218295606e-10, "logits/chosen": -1.4361473321914673, "logits/rejected": -1.7182174921035767, "logps/chosen": -134.17694091796875, "logps/ref_chosen": -71.05281066894531, "logps/ref_rejected": -94.23469543457031, "logps/rejected": -193.00314331054688, "loss": 1.1756, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5147565603256226, "rewards/margins": 0.2868105173110962, "rewards/rejected": -0.8015670776367188, "step": 651 }, { "epoch": 0.9856386999244142, "grad_norm": 7.7754902839660645, "kl/avg_steps": 0.375, "kl/beta": 0.008129570633172989, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.4957118863768176e-10, "logits/chosen": -1.7230968475341797, "logits/rejected": -1.4568369388580322, "logps/chosen": -147.6585693359375, "logps/ref_chosen": -80.06941223144531, "logps/ref_rejected": -99.22327423095703, "logps/rejected": -207.12478637695312, "loss": 1.1528, "rewards/accuracies": 0.6875, "rewards/chosen": -0.54934161901474, "rewards/margins": 0.32308200001716614, "rewards/rejected": -0.8724236488342285, "step": 652 }, { "epoch": 0.9871504157218443, "grad_norm": 7.7242231369018555, "kl/avg_steps": 0.46875, "kl/beta": 0.008099198341369629, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.831652042480093e-10, "logits/chosen": -1.3579241037368774, "logits/rejected": -1.3024935722351074, "logps/chosen": -139.64584350585938, "logps/ref_chosen": -80.35701751708984, "logps/ref_rejected": -92.1295394897461, "logps/rejected": -197.7828369140625, "loss": 1.1113, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4791703522205353, "rewards/margins": 0.3704642653465271, "rewards/rejected": -0.8496346473693848, "step": 653 }, { "epoch": 0.9886621315192744, "grad_norm": 6.726657390594482, "kl/avg_steps": 0.46875, "kl/beta": 0.008061409927904606, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.2374433653205016e-10, "logits/chosen": -1.325247883796692, "logits/rejected": -1.6915186643600464, "logps/chosen": -146.48092651367188, "logps/ref_chosen": -78.06475830078125, "logps/ref_rejected": -106.05763244628906, "logps/rejected": -208.97283935546875, "loss": 1.1938, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5504453778266907, "rewards/margins": 0.2738248109817505, "rewards/rejected": -0.8242701888084412, "step": 654 }, { "epoch": 0.9901738473167044, "grad_norm": 6.092385768890381, "kl/avg_steps": 0.4375, "kl/beta": 0.00802379846572876, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.7131024761923852e-10, "logits/chosen": -1.2895491123199463, "logits/rejected": -1.8892626762390137, "logps/chosen": -125.02395629882812, "logps/ref_chosen": -67.03407287597656, "logps/ref_rejected": -97.57197570800781, "logps/rejected": -196.07412719726562, "loss": 1.1371, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4643661677837372, "rewards/margins": 0.32108306884765625, "rewards/rejected": -0.7854492664337158, "step": 655 }, { "epoch": 0.9916855631141346, "grad_norm": 4.83301305770874, "kl/avg_steps": 0.59375, "kl/beta": 0.00798884779214859, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2586440420372934e-10, "logits/chosen": -1.4277193546295166, "logits/rejected": -1.4072365760803223, "logps/chosen": -158.8524627685547, "logps/ref_chosen": -89.31462860107422, "logps/ref_rejected": -105.14315795898438, "logps/rejected": -217.38514709472656, "loss": 1.1364, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5535690188407898, "rewards/margins": 0.33648404479026794, "rewards/rejected": -0.8900530338287354, "step": 656 }, { "epoch": 0.9931972789115646, "grad_norm": 7.831459999084473, "kl/avg_steps": 0.625, "kl/beta": 0.007941693998873234, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.740807750345913e-11, "logits/chosen": -1.1185672283172607, "logits/rejected": -1.3250389099121094, "logps/chosen": -120.76776123046875, "logps/ref_chosen": -64.89747619628906, "logps/ref_rejected": -94.21998596191406, "logps/rejected": -206.50807189941406, "loss": 1.0521, "rewards/accuracies": 0.796875, "rewards/chosen": -0.44156578183174133, "rewards/margins": 0.44281652569770813, "rewards/rejected": -0.8843823671340942, "step": 657 }, { "epoch": 0.9947089947089947, "grad_norm": 8.166825294494629, "kl/avg_steps": 0.40625, "kl/beta": 0.007892366498708725, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.594234322453539e-11, "logits/chosen": -1.4240777492523193, "logits/rejected": -1.3959312438964844, "logps/chosen": -142.5401611328125, "logps/ref_chosen": -81.16606140136719, "logps/ref_rejected": -97.72825622558594, "logps/rejected": -198.3736572265625, "loss": 1.1763, "rewards/accuracies": 0.703125, "rewards/chosen": -0.48336413502693176, "rewards/margins": 0.305160254240036, "rewards/rejected": -0.7885243892669678, "step": 658 }, { "epoch": 0.9962207105064248, "grad_norm": 5.3906121253967285, "kl/avg_steps": 0.421875, "kl/beta": 0.00786043331027031, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.146808153123293e-11, "logits/chosen": -1.1394093036651611, "logits/rejected": -1.665954828262329, "logps/chosen": -145.08837890625, "logps/ref_chosen": -74.42193603515625, "logps/ref_rejected": -87.81561279296875, "logps/rejected": -184.11553955078125, "loss": 1.2645, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5546475052833557, "rewards/margins": 0.19737288355827332, "rewards/rejected": -0.7520203590393066, "step": 659 }, { "epoch": 0.9977324263038548, "grad_norm": 6.83611536026001, "kl/avg_steps": 0.546875, "kl/beta": 0.007827411405742168, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3985977021235829e-11, "logits/chosen": -1.5256366729736328, "logits/rejected": -1.4637858867645264, "logps/chosen": -136.3591766357422, "logps/ref_chosen": -71.68512725830078, "logps/ref_rejected": -98.01472473144531, "logps/rejected": -211.43475341796875, "loss": 1.0893, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5045329332351685, "rewards/margins": 0.3771495819091797, "rewards/rejected": -0.8816825151443481, "step": 660 }, { "epoch": 0.999244142101285, "grad_norm": 5.742647647857666, "kl/avg_steps": 0.34375, "kl/beta": 0.007784838322550058, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.4965187065971735e-12, "logits/chosen": -1.083855152130127, "logits/rejected": -1.362886667251587, "logps/chosen": -159.15509033203125, "logps/ref_chosen": -78.35111999511719, "logps/ref_rejected": -99.47113037109375, "logps/rejected": -208.73788452148438, "loss": 1.2429, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6288719177246094, "rewards/margins": 0.21748466789722443, "rewards/rejected": -0.8463565111160278, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.1575256359739492, "train_runtime": 3253.0214, "train_samples_per_second": 13.014, "train_steps_per_second": 0.203 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }