{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 14.28126049041748, "kl/avg_steps": -0.0390625, "kl/beta": 0.009999999776482582, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.4765625, "learning_rate": 0.0, "logits/chosen": -0.364409476518631, "logits/rejected": -0.3671390116214752, "logps/chosen": -275.48590087890625, "logps/ref_chosen": -275.43902587890625, "logps/ref_rejected": -223.14576721191406, "logps/rejected": -223.16470336914062, "loss": 2.7733, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.0004925209796056151, "rewards/margins": -0.0003269182052463293, "rewards/rejected": -0.00016560273070354015, "step": 1 }, { "epoch": 0.010471204188481676, "grad_norm": 14.75130844116211, "kl/avg_steps": 0.013671875, "kl/beta": 0.009998245164752007, "kl/n_epsilon_steps": 0.48828125, "kl/p_epsilon_steps": 0.501953125, "learning_rate": 4.166666666666666e-08, "logits/chosen": -0.45231470465660095, "logits/rejected": -0.4597889184951782, "logps/chosen": -292.59796142578125, "logps/ref_chosen": -292.61004638671875, "logps/ref_rejected": -276.7996520996094, "logps/rejected": -276.81085205078125, "loss": 2.7723, "rewards/accuracies": 0.5078125, "rewards/chosen": 9.182449139188975e-05, "rewards/margins": 0.0001786811335477978, "rewards/rejected": -8.685662760399282e-05, "step": 5 }, { "epoch": 0.020942408376963352, "grad_norm": 13.28615951538086, "kl/avg_steps": -0.0015625000232830644, "kl/beta": 0.00998986978083849, "kl/n_epsilon_steps": 0.4937500059604645, "kl/p_epsilon_steps": 0.4921875, "learning_rate": 9.375e-08, "logits/chosen": -0.4420033395290375, "logits/rejected": -0.43265849351882935, "logps/chosen": -288.40545654296875, "logps/ref_chosen": -288.4424133300781, "logps/ref_rejected": -255.2630615234375, "logps/rejected": -255.2399139404297, "loss": 2.7724, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 0.0003403747396077961, "rewards/margins": 8.325525413965806e-05, "rewards/rejected": 0.0002571194781921804, "step": 10 }, { "epoch": 0.031413612565445025, "grad_norm": 15.162229537963867, "kl/avg_steps": 0.0390625, "kl/beta": 0.009990684688091278, "kl/n_epsilon_steps": 0.4765625, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -0.41182345151901245, "logits/rejected": -0.42728322744369507, "logps/chosen": -287.8147277832031, "logps/ref_chosen": -287.860107421875, "logps/ref_rejected": -260.53314208984375, "logps/rejected": -260.57171630859375, "loss": 2.771, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.0004283771850168705, "rewards/margins": 0.0007861476624384522, "rewards/rejected": -0.00035777047742158175, "step": 15 }, { "epoch": 0.041884816753926704, "grad_norm": 14.730121612548828, "kl/avg_steps": 0.10000000149011612, "kl/beta": 0.009967166930437088, "kl/n_epsilon_steps": 0.4468750059604645, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.402193546295166, "logits/rejected": -0.4104000926017761, "logps/chosen": -286.76837158203125, "logps/ref_chosen": -286.84619140625, "logps/ref_rejected": -258.8122253417969, "logps/rejected": -258.8099365234375, "loss": 2.7712, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0007459347834810615, "rewards/margins": 0.0006972032715566456, "rewards/rejected": 4.8731650167610496e-05, "step": 20 }, { "epoch": 0.05235602094240838, "grad_norm": 13.414973258972168, "kl/avg_steps": 0.14531250298023224, "kl/beta": 0.009911659173667431, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.567187488079071, "learning_rate": 2.5e-07, "logits/chosen": -0.45143261551856995, "logits/rejected": -0.41997185349464417, "logps/chosen": -278.1541748046875, "logps/ref_chosen": -278.32708740234375, "logps/ref_rejected": -265.2242431640625, "logps/rejected": -265.2095947265625, "loss": 2.7696, "rewards/accuracies": 0.5640624761581421, "rewards/chosen": 0.0016819715965539217, "rewards/margins": 0.0015082823811098933, "rewards/rejected": 0.0001736890699248761, "step": 25 }, { "epoch": 0.06282722513089005, "grad_norm": 14.05941390991211, "kl/avg_steps": 0.18125000596046448, "kl/beta": 0.009822528809309006, "kl/n_epsilon_steps": 0.4046874940395355, "kl/p_epsilon_steps": 0.5859375, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.42877644300460815, "logits/rejected": -0.44940271973609924, "logps/chosen": -284.7930603027344, "logps/ref_chosen": -285.1208190917969, "logps/ref_rejected": -253.87570190429688, "logps/rejected": -253.77908325195312, "loss": 2.7682, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0031784414313733578, "rewards/margins": 0.0022038619499653578, "rewards/rejected": 0.0009745795396156609, "step": 30 }, { "epoch": 0.07329842931937172, "grad_norm": 12.731877326965332, "kl/avg_steps": 0.2953124940395355, "kl/beta": 0.009719033725559711, "kl/n_epsilon_steps": 0.3453125059604645, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.541666666666667e-07, "logits/chosen": -0.4260304868221283, "logits/rejected": -0.4479770064353943, "logps/chosen": -288.73638916015625, "logps/ref_chosen": -289.319580078125, "logps/ref_rejected": -253.91830444335938, "logps/rejected": -253.723388671875, "loss": 2.7653, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.005606816615909338, "rewards/margins": 0.003685607109218836, "rewards/rejected": 0.0019212098559364676, "step": 35 }, { "epoch": 0.08376963350785341, "grad_norm": 12.928390502929688, "kl/avg_steps": 0.34687501192092896, "kl/beta": 0.009557623416185379, "kl/n_epsilon_steps": 0.32343751192092896, "kl/p_epsilon_steps": 0.6703125238418579, "learning_rate": 4.0625e-07, "logits/chosen": -0.40764012932777405, "logits/rejected": -0.4099349081516266, "logps/chosen": -289.9876708984375, "logps/ref_chosen": -290.99627685546875, "logps/ref_rejected": -269.1242370605469, "logps/rejected": -268.88873291015625, "loss": 2.7582, "rewards/accuracies": 0.671875, "rewards/chosen": 0.009543242864310741, "rewards/margins": 0.007252500858157873, "rewards/rejected": 0.0022907420061528683, "step": 40 }, { "epoch": 0.09424083769633508, "grad_norm": 13.4513578414917, "kl/avg_steps": 0.42500001192092896, "kl/beta": 0.009382685646414757, "kl/n_epsilon_steps": 0.28437501192092896, "kl/p_epsilon_steps": 0.7093750238418579, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -0.44510626792907715, "logits/rejected": -0.45678257942199707, "logps/chosen": -293.55364990234375, "logps/ref_chosen": -294.90985107421875, "logps/ref_rejected": -272.50750732421875, "logps/rejected": -272.3128967285156, "loss": 2.7515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.012580705806612968, "rewards/margins": 0.010696337558329105, "rewards/rejected": 0.0018843680154532194, "step": 45 }, { "epoch": 0.10471204188481675, "grad_norm": 12.670825004577637, "kl/avg_steps": 0.33906251192092896, "kl/beta": 0.009193787351250648, "kl/n_epsilon_steps": 0.3265624940395355, "kl/p_epsilon_steps": 0.6656249761581421, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.3990762233734131, "logits/rejected": -0.43204984068870544, "logps/chosen": -276.26300048828125, "logps/ref_chosen": -278.0777587890625, "logps/ref_rejected": -264.7014465332031, "logps/rejected": -264.21429443359375, "loss": 2.7492, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.01650671288371086, "rewards/margins": 0.011964295990765095, "rewards/rejected": 0.004542418755590916, "step": 50 }, { "epoch": 0.11518324607329843, "grad_norm": 11.116233825683594, "kl/avg_steps": 0.3343749940395355, "kl/beta": 0.009037832729518414, "kl/n_epsilon_steps": 0.33125001192092896, "kl/p_epsilon_steps": 0.6656249761581421, "learning_rate": 4.997587164001815e-07, "logits/chosen": -0.458177387714386, "logits/rejected": -0.4686247408390045, "logps/chosen": -275.80706787109375, "logps/ref_chosen": -278.2171630859375, "logps/ref_rejected": -266.28826904296875, "logps/rejected": -266.1267395019531, "loss": 2.734, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.021547086536884308, "rewards/margins": 0.019951194524765015, "rewards/rejected": 0.0015958904987201095, "step": 55 }, { "epoch": 0.1256544502617801, "grad_norm": 12.35992431640625, "kl/avg_steps": 0.37968748807907104, "kl/beta": 0.008887865580618382, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.4476288855075836, "logits/rejected": -0.42895251512527466, "logps/chosen": -272.4042663574219, "logps/ref_chosen": -275.2093505859375, "logps/ref_rejected": -257.0248107910156, "logps/rejected": -257.15692138671875, "loss": 2.7234, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.024633441120386124, "rewards/margins": 0.02564925327897072, "rewards/rejected": -0.0010158123914152384, "step": 60 }, { "epoch": 0.13612565445026178, "grad_norm": 12.078445434570312, "kl/avg_steps": 0.3343749940395355, "kl/beta": 0.008730259723961353, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.6625000238418579, "learning_rate": 4.982858918131906e-07, "logits/chosen": -0.48387449979782104, "logits/rejected": -0.47897014021873474, "logps/chosen": -271.87811279296875, "logps/ref_chosen": -275.43511962890625, "logps/ref_rejected": -263.5926818847656, "logps/rejected": -263.5385437011719, "loss": 2.7153, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.030704837292432785, "rewards/margins": 0.03002239391207695, "rewards/rejected": 0.0006824458832852542, "step": 65 }, { "epoch": 0.14659685863874344, "grad_norm": 12.209461212158203, "kl/avg_steps": 0.375, "kl/beta": 0.008580431342124939, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.682812511920929, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.5278276801109314, "logits/rejected": -0.5665954351425171, "logps/chosen": -276.12548828125, "logps/ref_chosen": -279.77947998046875, "logps/ref_rejected": -256.8297424316406, "logps/rejected": -257.9794921875, "loss": 2.6963, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.0309266597032547, "rewards/margins": 0.040461957454681396, "rewards/rejected": -0.009535295888781548, "step": 70 }, { "epoch": 0.15706806282722513, "grad_norm": 12.27260684967041, "kl/avg_steps": 0.37812501192092896, "kl/beta": 0.008418848738074303, "kl/n_epsilon_steps": 0.3062500059604645, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.5069125294685364, "logits/rejected": -0.502475380897522, "logps/chosen": -277.47296142578125, "logps/ref_chosen": -281.63433837890625, "logps/ref_rejected": -277.03350830078125, "logps/rejected": -278.06256103515625, "loss": 2.693, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.034517042338848114, "rewards/margins": 0.0428413525223732, "rewards/rejected": -0.008324312046170235, "step": 75 }, { "epoch": 0.16753926701570682, "grad_norm": 11.939748764038086, "kl/avg_steps": 0.36250001192092896, "kl/beta": 0.008260714821517467, "kl/n_epsilon_steps": 0.3140625059604645, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.47688254714012146, "logits/rejected": -0.47220802307128906, "logps/chosen": -276.2677917480469, "logps/ref_chosen": -279.67755126953125, "logps/ref_rejected": -247.29833984375, "logps/rejected": -251.18466186523438, "loss": 2.6628, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": 0.027681510895490646, "rewards/margins": 0.059294771403074265, "rewards/rejected": -0.03161326050758362, "step": 80 }, { "epoch": 0.17801047120418848, "grad_norm": 11.864156723022461, "kl/avg_steps": 0.3609375059604645, "kl/beta": 0.008115144446492195, "kl/n_epsilon_steps": 0.3187499940395355, "kl/p_epsilon_steps": 0.6796875, "learning_rate": 4.913625927427995e-07, "logits/chosen": -0.5454100370407104, "logits/rejected": -0.5279535055160522, "logps/chosen": -271.1054992675781, "logps/ref_chosen": -272.01007080078125, "logps/ref_rejected": -258.8889465332031, "logps/rejected": -265.29791259765625, "loss": 2.6678, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.006850575562566519, "rewards/margins": 0.05821793153882027, "rewards/rejected": -0.05136735364794731, "step": 85 }, { "epoch": 0.18848167539267016, "grad_norm": 11.893303871154785, "kl/avg_steps": 0.37812501192092896, "kl/beta": 0.007967790588736534, "kl/n_epsilon_steps": 0.3062500059604645, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.6225690841674805, "logits/rejected": -0.5903512239456177, "logps/chosen": -285.7995910644531, "logps/ref_chosen": -285.41748046875, "logps/ref_rejected": -263.9450378417969, "logps/rejected": -273.43133544921875, "loss": 2.6438, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0035442456137388945, "rewards/margins": 0.07133002579212189, "rewards/rejected": -0.07487426698207855, "step": 90 }, { "epoch": 0.19895287958115182, "grad_norm": 13.124085426330566, "kl/avg_steps": 0.359375, "kl/beta": 0.007824316620826721, "kl/n_epsilon_steps": 0.31562501192092896, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 4.859492293879573e-07, "logits/chosen": -0.5456125140190125, "logits/rejected": -0.5421279072761536, "logps/chosen": -274.5228576660156, "logps/ref_chosen": -271.7696533203125, "logps/ref_rejected": -255.344970703125, "logps/rejected": -267.8470153808594, "loss": 2.6403, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.022010665386915207, "rewards/margins": 0.07486502826213837, "rewards/rejected": -0.09687568247318268, "step": 95 }, { "epoch": 0.2094240837696335, "grad_norm": 13.929049491882324, "kl/avg_steps": 0.359375, "kl/beta": 0.0076828403398394585, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.5994928479194641, "logits/rejected": -0.6089519262313843, "logps/chosen": -295.6308898925781, "logps/ref_chosen": -289.942626953125, "logps/ref_rejected": -262.18438720703125, "logps/rejected": -279.8243713378906, "loss": 2.6153, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04416309669613838, "rewards/margins": 0.09017638117074966, "rewards/rejected": -0.13433948159217834, "step": 100 }, { "epoch": 0.2198952879581152, "grad_norm": 13.462470054626465, "kl/avg_steps": 0.44999998807907104, "kl/beta": 0.007534568663686514, "kl/n_epsilon_steps": 0.27031248807907104, "kl/p_epsilon_steps": 0.7203124761581421, "learning_rate": 4.792711016345321e-07, "logits/chosen": -0.6025761961936951, "logits/rejected": -0.6042689085006714, "logps/chosen": -270.66156005859375, "logps/ref_chosen": -264.43994140625, "logps/ref_rejected": -259.32550048828125, "logps/rejected": -280.54681396484375, "loss": 2.578, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.04736360162496567, "rewards/margins": 0.1112045869231224, "rewards/rejected": -0.15856818854808807, "step": 105 }, { "epoch": 0.23036649214659685, "grad_norm": 13.279642105102539, "kl/avg_steps": 0.3828125, "kl/beta": 0.007380378432571888, "kl/n_epsilon_steps": 0.3046875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.6601926684379578, "logits/rejected": -0.6502302289009094, "logps/chosen": -308.3574523925781, "logps/ref_chosen": -299.7341613769531, "logps/ref_rejected": -267.6495361328125, "logps/rejected": -294.60247802734375, "loss": 2.5437, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.06412671506404877, "rewards/margins": 0.1331605762243271, "rewards/rejected": -0.19728729128837585, "step": 110 }, { "epoch": 0.24083769633507854, "grad_norm": 16.528404235839844, "kl/avg_steps": 0.3187499940395355, "kl/beta": 0.007241943385452032, "kl/n_epsilon_steps": 0.33906251192092896, "kl/p_epsilon_steps": 0.6578124761581421, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -0.7043158411979675, "logits/rejected": -0.6803773045539856, "logps/chosen": -302.77886962890625, "logps/ref_chosen": -286.24127197265625, "logps/ref_rejected": -270.0053405761719, "logps/rejected": -304.2045593261719, "loss": 2.5712, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12022699415683746, "rewards/margins": 0.12564857304096222, "rewards/rejected": -0.24587556719779968, "step": 115 }, { "epoch": 0.2513089005235602, "grad_norm": 15.809136390686035, "kl/avg_steps": 0.31562501192092896, "kl/beta": 0.007125412113964558, "kl/n_epsilon_steps": 0.3343749940395355, "kl/p_epsilon_steps": 0.6499999761581421, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.7761000990867615, "logits/rejected": -0.7452162504196167, "logps/chosen": -316.8116760253906, "logps/ref_chosen": -289.09954833984375, "logps/ref_rejected": -265.402587890625, "logps/rejected": -313.4027404785156, "loss": 2.5454, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.19777658581733704, "rewards/margins": 0.1420091986656189, "rewards/rejected": -0.33978578448295593, "step": 120 }, { "epoch": 0.2617801047120419, "grad_norm": 20.728435516357422, "kl/avg_steps": 0.3187499940395355, "kl/beta": 0.007016216870397329, "kl/n_epsilon_steps": 0.3343749940395355, "kl/p_epsilon_steps": 0.653124988079071, "learning_rate": 4.6227032831928483e-07, "logits/chosen": -0.8145838975906372, "logits/rejected": -0.7571443915367126, "logps/chosen": -308.98565673828125, "logps/ref_chosen": -276.1886291503906, "logps/ref_rejected": -255.31884765625, "logps/rejected": -309.42779541015625, "loss": 2.5476, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2306874692440033, "rewards/margins": 0.1467394083738327, "rewards/rejected": -0.3774269223213196, "step": 125 }, { "epoch": 0.27225130890052357, "grad_norm": 19.640256881713867, "kl/avg_steps": 0.4078125059604645, "kl/beta": 0.006901729851961136, "kl/n_epsilon_steps": 0.29374998807907104, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.7584047317504883, "logits/rejected": -0.7613896131515503, "logps/chosen": -321.8742980957031, "logps/ref_chosen": -296.58355712890625, "logps/ref_rejected": -276.31829833984375, "logps/rejected": -330.4574279785156, "loss": 2.4667, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.1750645786523819, "rewards/margins": 0.19592006504535675, "rewards/rejected": -0.37098461389541626, "step": 130 }, { "epoch": 0.28272251308900526, "grad_norm": 21.653127670288086, "kl/avg_steps": 0.359375, "kl/beta": 0.006763220764696598, "kl/n_epsilon_steps": 0.3140625059604645, "kl/p_epsilon_steps": 0.6734374761581421, "learning_rate": 4.520388124165564e-07, "logits/chosen": -0.74022376537323, "logits/rejected": -0.7336807250976562, "logps/chosen": -333.85150146484375, "logps/ref_chosen": -295.8021545410156, "logps/ref_rejected": -277.921142578125, "logps/rejected": -343.9541320800781, "loss": 2.4937, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.2576160430908203, "rewards/margins": 0.18604378402233124, "rewards/rejected": -0.44365978240966797, "step": 135 }, { "epoch": 0.2931937172774869, "grad_norm": 25.029287338256836, "kl/avg_steps": 0.3296875059604645, "kl/beta": 0.006647522561252117, "kl/n_epsilon_steps": 0.33125001192092896, "kl/p_epsilon_steps": 0.660937488079071, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.8026041984558105, "logits/rejected": -0.7918664216995239, "logps/chosen": -334.2804260253906, "logps/ref_chosen": -283.0990295410156, "logps/ref_rejected": -264.1083679199219, "logps/rejected": -344.59429931640625, "loss": 2.4961, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.3406330943107605, "rewards/margins": 0.19122302532196045, "rewards/rejected": -0.5318561792373657, "step": 140 }, { "epoch": 0.3036649214659686, "grad_norm": 19.541704177856445, "kl/avg_steps": 0.4000000059604645, "kl/beta": 0.006527472287416458, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 4.4072430294890166e-07, "logits/chosen": -0.8155800104141235, "logits/rejected": -0.7769054174423218, "logps/chosen": -337.3866271972656, "logps/ref_chosen": -293.6390380859375, "logps/ref_rejected": -251.7206573486328, "logps/rejected": -329.2652282714844, "loss": 2.4545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.28576841950416565, "rewards/margins": 0.21693627536296844, "rewards/rejected": -0.5027046799659729, "step": 145 }, { "epoch": 0.31413612565445026, "grad_norm": 22.123804092407227, "kl/avg_steps": 0.375, "kl/beta": 0.00640533585101366, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.682812511920929, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.8539741635322571, "logits/rejected": -0.8217877149581909, "logps/chosen": -334.0752868652344, "logps/ref_chosen": -280.3023986816406, "logps/ref_rejected": -266.30657958984375, "logps/rejected": -355.8968811035156, "loss": 2.4396, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3443171977996826, "rewards/margins": 0.22578899562358856, "rewards/rejected": -0.5701061487197876, "step": 150 }, { "epoch": 0.32460732984293195, "grad_norm": 32.74282455444336, "kl/avg_steps": 0.42500001192092896, "kl/beta": 0.00627851951867342, "kl/n_epsilon_steps": 0.28437501192092896, "kl/p_epsilon_steps": 0.7093750238418579, "learning_rate": 4.2838744935687716e-07, "logits/chosen": -0.881779670715332, "logits/rejected": -0.8399287462234497, "logps/chosen": -348.90155029296875, "logps/ref_chosen": -283.4206848144531, "logps/ref_rejected": -275.6944885253906, "logps/rejected": -391.3532409667969, "loss": 2.3244, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.41083288192749023, "rewards/margins": 0.3107456564903259, "rewards/rejected": -0.7215785384178162, "step": 155 }, { "epoch": 0.33507853403141363, "grad_norm": 24.432859420776367, "kl/avg_steps": 0.3921875059604645, "kl/beta": 0.006150397472083569, "kl/n_epsilon_steps": 0.30000001192092896, "kl/p_epsilon_steps": 0.692187488079071, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.8856340646743774, "logits/rejected": -0.8543170690536499, "logps/chosen": -361.45648193359375, "logps/ref_chosen": -287.5817565917969, "logps/ref_rejected": -257.6918029785156, "logps/rejected": -380.94830322265625, "loss": 2.3581, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.45420369505882263, "rewards/margins": 0.2992251217365265, "rewards/rejected": -0.7534288167953491, "step": 160 }, { "epoch": 0.34554973821989526, "grad_norm": 29.309368133544922, "kl/avg_steps": 0.4000000059604645, "kl/beta": 0.0060306694358587265, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 4.1509438117713863e-07, "logits/chosen": -0.8547463417053223, "logits/rejected": -0.8155299425125122, "logps/chosen": -364.8583984375, "logps/ref_chosen": -289.0608215332031, "logps/ref_rejected": -249.4071807861328, "logps/rejected": -372.29840087890625, "loss": 2.3786, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4568546712398529, "rewards/margins": 0.2797931730747223, "rewards/rejected": -0.7366477847099304, "step": 165 }, { "epoch": 0.35602094240837695, "grad_norm": 45.036048889160156, "kl/avg_steps": 0.41093748807907104, "kl/beta": 0.005911406595259905, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.7270597219467163, "logits/rejected": -0.6853420734405518, "logps/chosen": -375.37933349609375, "logps/ref_chosen": -288.40557861328125, "logps/ref_rejected": -255.679443359375, "logps/rejected": -396.35137939453125, "loss": 2.3365, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.5136893391609192, "rewards/margins": 0.3125835359096527, "rewards/rejected": -0.8262729644775391, "step": 170 }, { "epoch": 0.36649214659685864, "grad_norm": 34.29857635498047, "kl/avg_steps": 0.44999998807907104, "kl/beta": 0.005786406807601452, "kl/n_epsilon_steps": 0.2718749940395355, "kl/p_epsilon_steps": 0.721875011920929, "learning_rate": 4.00916353566676e-07, "logits/chosen": -0.7422696352005005, "logits/rejected": -0.7540820837020874, "logps/chosen": -393.28900146484375, "logps/ref_chosen": -303.4944763183594, "logps/ref_rejected": -274.523193359375, "logps/rejected": -417.24163818359375, "loss": 2.3502, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5188406109809875, "rewards/margins": 0.3016633689403534, "rewards/rejected": -0.8205038905143738, "step": 175 }, { "epoch": 0.3769633507853403, "grad_norm": 36.96628189086914, "kl/avg_steps": 0.4453125, "kl/beta": 0.005661297123879194, "kl/n_epsilon_steps": 0.2750000059604645, "kl/p_epsilon_steps": 0.7203124761581421, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.7044585943222046, "logits/rejected": -0.6638351082801819, "logps/chosen": -356.0911865234375, "logps/ref_chosen": -272.7525634765625, "logps/ref_rejected": -258.00250244140625, "logps/rejected": -394.07452392578125, "loss": 2.3785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4715401530265808, "rewards/margins": 0.29359155893325806, "rewards/rejected": -0.7651317119598389, "step": 180 }, { "epoch": 0.387434554973822, "grad_norm": 34.58934020996094, "kl/avg_steps": 0.4468750059604645, "kl/beta": 0.005536334123462439, "kl/n_epsilon_steps": 0.2718749940395355, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.859293653520604e-07, "logits/chosen": -0.8004829287528992, "logits/rejected": -0.8089984059333801, "logps/chosen": -384.07379150390625, "logps/ref_chosen": -288.7179870605469, "logps/ref_rejected": -262.846923828125, "logps/rejected": -421.9869079589844, "loss": 2.2846, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.5269938707351685, "rewards/margins": 0.3479762673377991, "rewards/rejected": -0.8749701380729675, "step": 185 }, { "epoch": 0.39790575916230364, "grad_norm": 37.24195861816406, "kl/avg_steps": 0.3984375, "kl/beta": 0.005422582384198904, "kl/n_epsilon_steps": 0.2953124940395355, "kl/p_epsilon_steps": 0.6937500238418579, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.8119276165962219, "logits/rejected": -0.7783881425857544, "logps/chosen": -398.28216552734375, "logps/ref_chosen": -284.51885986328125, "logps/ref_rejected": -257.11376953125, "logps/rejected": -432.58270263671875, "loss": 2.3371, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.6162558197975159, "rewards/margins": 0.32933706045150757, "rewards/rejected": -0.9455928802490234, "step": 190 }, { "epoch": 0.4083769633507853, "grad_norm": 40.757484436035156, "kl/avg_steps": 0.41718751192092896, "kl/beta": 0.005306036677211523, "kl/n_epsilon_steps": 0.2890625, "kl/p_epsilon_steps": 0.706250011920929, "learning_rate": 3.7021375165108377e-07, "logits/chosen": -0.8377869725227356, "logits/rejected": -0.7861225008964539, "logps/chosen": -397.78460693359375, "logps/ref_chosen": -270.699951171875, "logps/ref_rejected": -265.62664794921875, "logps/rejected": -450.6075134277344, "loss": 2.3781, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.6730450987815857, "rewards/margins": 0.30246153473854065, "rewards/rejected": -0.9755066633224487, "step": 195 }, { "epoch": 0.418848167539267, "grad_norm": 48.02657699584961, "kl/avg_steps": 0.4140625, "kl/beta": 0.005196661688387394, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.7046874761581421, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.820801854133606, "logits/rejected": -0.7757973074913025, "logps/chosen": -411.86456298828125, "logps/ref_chosen": -294.84271240234375, "logps/ref_rejected": -259.71832275390625, "logps/rejected": -441.1104431152344, "loss": 2.3277, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.607452392578125, "rewards/margins": 0.32958686351776123, "rewards/rejected": -0.9370392560958862, "step": 200 }, { "epoch": 0.418848167539267, "eval_kl/n_epsilon_steps": 0.30443549156188965, "eval_kl/p_epsilon_steps": 0.6885080933570862, "eval_logits/chosen": -0.8135029077529907, "eval_logits/rejected": -0.7841039896011353, "eval_logps/chosen": -411.3474426269531, "eval_logps/ref_chosen": -287.9388427734375, "eval_logps/ref_rejected": -266.7934875488281, "eval_logps/rejected": -452.2705993652344, "eval_loss": 0.590430498123169, "eval_rewards/accuracies": 0.7011088728904724, "eval_rewards/chosen": -0.6331002116203308, "eval_rewards/margins": 0.3137185871601105, "eval_rewards/rejected": -0.9468188881874084, "eval_runtime": 50.808, "eval_samples_per_second": 39.364, "eval_steps_per_second": 1.24, "step": 200 }, { "epoch": 0.4293193717277487, "grad_norm": 34.794654846191406, "kl/avg_steps": 0.44843751192092896, "kl/beta": 0.005094348452985287, "kl/n_epsilon_steps": 0.26875001192092896, "kl/p_epsilon_steps": 0.7171875238418579, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -0.780733585357666, "logits/rejected": -0.7451142072677612, "logps/chosen": -400.80535888671875, "logps/ref_chosen": -285.2747802734375, "logps/ref_rejected": -260.1707458496094, "logps/rejected": -440.553466796875, "loss": 2.3228, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5877382159233093, "rewards/margins": 0.32529404759407043, "rewards/rejected": -0.9130322337150574, "step": 205 }, { "epoch": 0.4397905759162304, "grad_norm": 31.52602195739746, "kl/avg_steps": 0.4515624940395355, "kl/beta": 0.00497779855504632, "kl/n_epsilon_steps": 0.27031248807907104, "kl/p_epsilon_steps": 0.721875011920929, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.7406963109970093, "logits/rejected": -0.7560266852378845, "logps/chosen": -416.0575256347656, "logps/ref_chosen": -289.1589050292969, "logps/ref_rejected": -283.6126708984375, "logps/rejected": -476.3191833496094, "loss": 2.352, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.6303154230117798, "rewards/margins": 0.32247281074523926, "rewards/rejected": -0.952788233757019, "step": 210 }, { "epoch": 0.450261780104712, "grad_norm": 46.989559173583984, "kl/avg_steps": 0.37812501192092896, "kl/beta": 0.004868713207542896, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.6859375238418579, "learning_rate": 3.3693706504794243e-07, "logits/chosen": -0.8139835596084595, "logits/rejected": -0.7599457502365112, "logps/chosen": -433.3778381347656, "logps/ref_chosen": -282.78741455078125, "logps/ref_rejected": -270.6185607910156, "logps/rejected": -487.9143981933594, "loss": 2.3546, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7319310307502747, "rewards/margins": 0.3200669586658478, "rewards/rejected": -1.0519979000091553, "step": 215 }, { "epoch": 0.4607329842931937, "grad_norm": 46.117774963378906, "kl/avg_steps": 0.4375, "kl/beta": 0.0047774300910532475, "kl/n_epsilon_steps": 0.27656251192092896, "kl/p_epsilon_steps": 0.714062511920929, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.8140425682067871, "logits/rejected": -0.7845497727394104, "logps/chosen": -469.22198486328125, "logps/ref_chosen": -306.7879943847656, "logps/ref_rejected": -276.37646484375, "logps/rejected": -520.35498046875, "loss": 2.2658, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7745460271835327, "rewards/margins": 0.38397735357284546, "rewards/rejected": -1.158523440361023, "step": 220 }, { "epoch": 0.4712041884816754, "grad_norm": 37.04048538208008, "kl/avg_steps": 0.39375001192092896, "kl/beta": 0.004678776487708092, "kl/n_epsilon_steps": 0.30156248807907104, "kl/p_epsilon_steps": 0.6953125, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -0.7957875728607178, "logits/rejected": -0.7486377954483032, "logps/chosen": -461.7236328125, "logps/ref_chosen": -289.04058837890625, "logps/ref_rejected": -266.5843811035156, "logps/rejected": -510.07672119140625, "loss": 2.3447, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.8065211176872253, "rewards/margins": 0.32613497972488403, "rewards/rejected": -1.1326560974121094, "step": 225 }, { "epoch": 0.4816753926701571, "grad_norm": 31.180253982543945, "kl/avg_steps": 0.44843751192092896, "kl/beta": 0.00458576250821352, "kl/n_epsilon_steps": 0.2734375, "kl/p_epsilon_steps": 0.721875011920929, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.7394207119941711, "logits/rejected": -0.7154208421707153, "logps/chosen": -430.73114013671875, "logps/ref_chosen": -275.30206298828125, "logps/ref_rejected": -255.2294158935547, "logps/rejected": -489.6163635253906, "loss": 2.2842, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.7114227414131165, "rewards/margins": 0.3566770553588867, "rewards/rejected": -1.0680997371673584, "step": 230 }, { "epoch": 0.49214659685863876, "grad_norm": 39.24580383300781, "kl/avg_steps": 0.4156250059604645, "kl/beta": 0.004480619449168444, "kl/n_epsilon_steps": 0.2874999940395355, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.017988329489923e-07, "logits/chosen": -0.7794148921966553, "logits/rejected": -0.7514842748641968, "logps/chosen": -441.8724060058594, "logps/ref_chosen": -292.72894287109375, "logps/ref_rejected": -268.83807373046875, "logps/rejected": -489.39251708984375, "loss": 2.3459, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.667233943939209, "rewards/margins": 0.31493279337882996, "rewards/rejected": -0.9821667671203613, "step": 235 }, { "epoch": 0.5026178010471204, "grad_norm": 27.304569244384766, "kl/avg_steps": 0.4156250059604645, "kl/beta": 0.004393292590975761, "kl/n_epsilon_steps": 0.28593748807907104, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.7677779197692871, "logits/rejected": -0.7548068165779114, "logps/chosen": -432.1578063964844, "logps/ref_chosen": -283.89190673828125, "logps/ref_rejected": -262.6282653808594, "logps/rejected": -484.4010314941406, "loss": 2.3406, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6503124237060547, "rewards/margins": 0.3181357979774475, "rewards/rejected": -0.9684481620788574, "step": 240 }, { "epoch": 0.5130890052356021, "grad_norm": 33.80092239379883, "kl/avg_steps": 0.43437498807907104, "kl/beta": 0.004302392713725567, "kl/n_epsilon_steps": 0.27812498807907104, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 2.837656413735479e-07, "logits/chosen": -0.8011455535888672, "logits/rejected": -0.7369574308395386, "logps/chosen": -439.2716369628906, "logps/ref_chosen": -293.95233154296875, "logps/ref_rejected": -262.296630859375, "logps/rejected": -485.175537109375, "loss": 2.3139, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6242814660072327, "rewards/margins": 0.32874006032943726, "rewards/rejected": -0.9530216455459595, "step": 245 }, { "epoch": 0.5235602094240838, "grad_norm": 29.35762596130371, "kl/avg_steps": 0.359375, "kl/beta": 0.004214797168970108, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.7795218229293823, "logits/rejected": -0.7625783085823059, "logps/chosen": -438.1741638183594, "logps/ref_chosen": -279.92138671875, "logps/ref_rejected": -255.0957794189453, "logps/rejected": -482.84124755859375, "loss": 2.3962, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6663497686386108, "rewards/margins": 0.2879102826118469, "rewards/rejected": -0.9542601704597473, "step": 250 }, { "epoch": 0.5340314136125655, "grad_norm": 40.971168518066406, "kl/avg_steps": 0.38749998807907104, "kl/beta": 0.0041356319561600685, "kl/n_epsilon_steps": 0.3046875, "kl/p_epsilon_steps": 0.692187488079071, "learning_rate": 2.655514550086086e-07, "logits/chosen": -0.8010396957397461, "logits/rejected": -0.7391474843025208, "logps/chosen": -453.9644470214844, "logps/ref_chosen": -286.27587890625, "logps/ref_rejected": -257.4590759277344, "logps/rejected": -503.93212890625, "loss": 2.3536, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6924406290054321, "rewards/margins": 0.32072776556015015, "rewards/rejected": -1.013168454170227, "step": 255 }, { "epoch": 0.5445026178010471, "grad_norm": 46.01213073730469, "kl/avg_steps": 0.3734374940395355, "kl/beta": 0.004054487682878971, "kl/n_epsilon_steps": 0.3109374940395355, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -0.7733880877494812, "logits/rejected": -0.7513821721076965, "logps/chosen": -470.3720703125, "logps/ref_chosen": -290.8160095214844, "logps/ref_rejected": -260.7832946777344, "logps/rejected": -515.3809814453125, "loss": 2.3782, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.7269195914268494, "rewards/margins": 0.2996986210346222, "rewards/rejected": -1.0266181230545044, "step": 260 }, { "epoch": 0.5549738219895288, "grad_norm": 33.39901351928711, "kl/avg_steps": 0.4437499940395355, "kl/beta": 0.003978157881647348, "kl/n_epsilon_steps": 0.26875001192092896, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 2.4725390780077905e-07, "logits/chosen": -0.7148987054824829, "logits/rejected": -0.70106440782547, "logps/chosen": -483.0361328125, "logps/ref_chosen": -275.0474548339844, "logps/ref_rejected": -260.8862609863281, "logps/rejected": -555.9755859375, "loss": 2.3094, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.8254634737968445, "rewards/margins": 0.3411196172237396, "rewards/rejected": -1.1665830612182617, "step": 265 }, { "epoch": 0.5654450261780105, "grad_norm": 39.24085235595703, "kl/avg_steps": 0.3812499940395355, "kl/beta": 0.003893459914252162, "kl/n_epsilon_steps": 0.3062500059604645, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.8372025489807129, "logits/rejected": -0.8036754727363586, "logps/chosen": -514.378662109375, "logps/ref_chosen": -286.2037353515625, "logps/ref_rejected": -257.1638488769531, "logps/rejected": -565.0552368164062, "loss": 2.3754, "rewards/accuracies": 0.703125, "rewards/chosen": -0.886857807636261, "rewards/margins": 0.30536893010139465, "rewards/rejected": -1.192226767539978, "step": 270 }, { "epoch": 0.5759162303664922, "grad_norm": 26.569904327392578, "kl/avg_steps": 0.43281251192092896, "kl/beta": 0.003820503130555153, "kl/n_epsilon_steps": 0.27812498807907104, "kl/p_epsilon_steps": 0.7109375, "learning_rate": 2.2897108053782e-07, "logits/chosen": -0.7596295475959778, "logits/rejected": -0.681503415107727, "logps/chosen": -490.3072204589844, "logps/ref_chosen": -279.13299560546875, "logps/ref_rejected": -259.39117431640625, "logps/rejected": -567.443115234375, "loss": 2.2678, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.8055984377861023, "rewards/margins": 0.364768922328949, "rewards/rejected": -1.1703672409057617, "step": 275 }, { "epoch": 0.5863874345549738, "grad_norm": 35.188594818115234, "kl/avg_steps": 0.3515625, "kl/beta": 0.0037416163831949234, "kl/n_epsilon_steps": 0.3203125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.748282253742218, "logits/rejected": -0.7246442437171936, "logps/chosen": -495.932373046875, "logps/ref_chosen": -282.1095886230469, "logps/ref_rejected": -264.97418212890625, "logps/rejected": -564.8289794921875, "loss": 2.3771, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7993821501731873, "rewards/margins": 0.31663891673088074, "rewards/rejected": -1.1160210371017456, "step": 280 }, { "epoch": 0.5968586387434555, "grad_norm": 47.78409194946289, "kl/avg_steps": 0.3578124940395355, "kl/beta": 0.003674892010167241, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 2.1080097510381294e-07, "logits/chosen": -0.7291465997695923, "logits/rejected": -0.6791597604751587, "logps/chosen": -517.5823364257812, "logps/ref_chosen": -290.4418029785156, "logps/ref_rejected": -268.6685791015625, "logps/rejected": -578.9339599609375, "loss": 2.3874, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.8336542248725891, "rewards/margins": 0.3005620241165161, "rewards/rejected": -1.13421630859375, "step": 285 }, { "epoch": 0.6073298429319371, "grad_norm": 39.73606872558594, "kl/avg_steps": 0.34375, "kl/beta": 0.003612424712628126, "kl/n_epsilon_steps": 0.32343751192092896, "kl/p_epsilon_steps": 0.667187511920929, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.7862906455993652, "logits/rejected": -0.7113832831382751, "logps/chosen": -526.5277709960938, "logps/ref_chosen": -299.27069091796875, "logps/ref_rejected": -273.0187683105469, "logps/rejected": -576.5018310546875, "loss": 2.4388, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.8203716278076172, "rewards/margins": 0.2704153060913086, "rewards/rejected": -1.0907868146896362, "step": 290 }, { "epoch": 0.6178010471204188, "grad_norm": 34.82834243774414, "kl/avg_steps": 0.43281251192092896, "kl/beta": 0.0035443275701254606, "kl/n_epsilon_steps": 0.2796874940395355, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 1.928409891572757e-07, "logits/chosen": -0.7738717794418335, "logits/rejected": -0.7536409497261047, "logps/chosen": -464.02081298828125, "logps/ref_chosen": -265.9072265625, "logps/ref_rejected": -260.17999267578125, "logps/rejected": -550.2824096679688, "loss": 2.3322, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.7006224393844604, "rewards/margins": 0.3213840126991272, "rewards/rejected": -1.0220063924789429, "step": 295 }, { "epoch": 0.6282722513089005, "grad_norm": 39.66903305053711, "kl/avg_steps": 0.42656248807907104, "kl/beta": 0.00346914934925735, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.707812488079071, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.770916223526001, "logits/rejected": -0.7498027682304382, "logps/chosen": -492.57708740234375, "logps/ref_chosen": -297.228515625, "logps/ref_rejected": -277.4806823730469, "logps/rejected": -560.9464721679688, "loss": 2.3602, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6763466596603394, "rewards/margins": 0.3012104332447052, "rewards/rejected": -0.9775570631027222, "step": 300 }, { "epoch": 0.6387434554973822, "grad_norm": 37.02199935913086, "kl/avg_steps": 0.48124998807907104, "kl/beta": 0.0033984233159571886, "kl/n_epsilon_steps": 0.25468748807907104, "kl/p_epsilon_steps": 0.7359374761581421, "learning_rate": 1.7518739404812155e-07, "logits/chosen": -0.7336605191230774, "logits/rejected": -0.6881910562515259, "logps/chosen": -477.9002990722656, "logps/ref_chosen": -280.66046142578125, "logps/ref_rejected": -260.27734375, "logps/rejected": -557.06005859375, "loss": 2.3082, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.6685991883277893, "rewards/margins": 0.3329974114894867, "rewards/rejected": -1.0015965700149536, "step": 305 }, { "epoch": 0.6492146596858639, "grad_norm": 35.926116943359375, "kl/avg_steps": 0.39375001192092896, "kl/beta": 0.0033192094415426254, "kl/n_epsilon_steps": 0.3031249940395355, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.7663410305976868, "logits/rejected": -0.7497197389602661, "logps/chosen": -523.6968994140625, "logps/ref_chosen": -291.5494079589844, "logps/ref_rejected": -259.37451171875, "logps/rejected": -582.4730224609375, "loss": 2.3585, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7690061330795288, "rewards/margins": 0.2971481680870056, "rewards/rejected": -1.0661542415618896, "step": 310 }, { "epoch": 0.6596858638743456, "grad_norm": 39.415775299072266, "kl/avg_steps": 0.40937501192092896, "kl/beta": 0.003254226641729474, "kl/n_epsilon_steps": 0.29218751192092896, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 1.5793481877199943e-07, "logits/chosen": -0.7955919504165649, "logits/rejected": -0.7476423382759094, "logps/chosen": -537.8627319335938, "logps/ref_chosen": -292.489501953125, "logps/ref_rejected": -265.90142822265625, "logps/rejected": -614.3427734375, "loss": 2.3441, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.797197699546814, "rewards/margins": 0.3302631378173828, "rewards/rejected": -1.1274608373641968, "step": 315 }, { "epoch": 0.6701570680628273, "grad_norm": 44.21771240234375, "kl/avg_steps": 0.44062501192092896, "kl/beta": 0.0031848729122430086, "kl/n_epsilon_steps": 0.2734375, "kl/p_epsilon_steps": 0.714062511920929, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.837963879108429, "logits/rejected": -0.783177375793457, "logps/chosen": -533.9610595703125, "logps/ref_chosen": -287.98382568359375, "logps/ref_rejected": -254.04556274414062, "logps/rejected": -604.9327392578125, "loss": 2.3005, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.7814763784408569, "rewards/margins": 0.329379677772522, "rewards/rejected": -1.1108559370040894, "step": 320 }, { "epoch": 0.680628272251309, "grad_norm": 46.65283966064453, "kl/avg_steps": 0.38749998807907104, "kl/beta": 0.003123135305941105, "kl/n_epsilon_steps": 0.30156248807907104, "kl/p_epsilon_steps": 0.6890624761581421, "learning_rate": 1.4117574272818386e-07, "logits/chosen": -0.7863418459892273, "logits/rejected": -0.7105034589767456, "logps/chosen": -537.3060302734375, "logps/ref_chosen": -279.3980712890625, "logps/ref_rejected": -248.03665161132812, "logps/rejected": -595.1973876953125, "loss": 2.4272, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.804741382598877, "rewards/margins": 0.2737797200679779, "rewards/rejected": -1.0785211324691772, "step": 325 }, { "epoch": 0.6910994764397905, "grad_norm": 29.98026466369629, "kl/avg_steps": 0.4937500059604645, "kl/beta": 0.0030527953058481216, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.7437499761581421, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.8432635068893433, "logits/rejected": -0.8058542013168335, "logps/chosen": -524.1719970703125, "logps/ref_chosen": -288.5478210449219, "logps/ref_rejected": -284.4470520019531, "logps/rejected": -632.7179565429688, "loss": 2.3035, "rewards/accuracies": 0.75, "rewards/chosen": -0.717328667640686, "rewards/margins": 0.33894094824790955, "rewards/rejected": -1.0562695264816284, "step": 330 }, { "epoch": 0.7015706806282722, "grad_norm": 31.66542625427246, "kl/avg_steps": 0.4078125059604645, "kl/beta": 0.002983611077070236, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.698437511920929, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.812516987323761, "logits/rejected": -0.7697084546089172, "logps/chosen": -526.1358642578125, "logps/ref_chosen": -284.29949951171875, "logps/ref_rejected": -253.87112426757812, "logps/rejected": -591.4185180664062, "loss": 2.4022, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.7203218340873718, "rewards/margins": 0.2809144854545593, "rewards/rejected": -1.0012364387512207, "step": 335 }, { "epoch": 0.7120418848167539, "grad_norm": 37.4275016784668, "kl/avg_steps": 0.40937501192092896, "kl/beta": 0.0029245249461382627, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.7679350972175598, "logits/rejected": -0.7031491994857788, "logps/chosen": -506.499267578125, "logps/ref_chosen": -271.03009033203125, "logps/ref_rejected": -258.16107177734375, "logps/rejected": -602.6541748046875, "loss": 2.3478, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6875978708267212, "rewards/margins": 0.3139950633049011, "rewards/rejected": -1.0015928745269775, "step": 340 }, { "epoch": 0.7225130890052356, "grad_norm": 39.2715950012207, "kl/avg_steps": 0.3765625059604645, "kl/beta": 0.0028661820106208324, "kl/n_epsilon_steps": 0.30937498807907104, "kl/p_epsilon_steps": 0.6859375238418579, "learning_rate": 1.09494297815e-07, "logits/chosen": -0.8377294540405273, "logits/rejected": -0.798270046710968, "logps/chosen": -541.9393310546875, "logps/ref_chosen": -296.1241149902344, "logps/ref_rejected": -271.4391784667969, "logps/rejected": -609.2572631835938, "loss": 2.4116, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.7037110328674316, "rewards/margins": 0.2594611346721649, "rewards/rejected": -0.9631722569465637, "step": 345 }, { "epoch": 0.7329842931937173, "grad_norm": 44.71064758300781, "kl/avg_steps": 0.41874998807907104, "kl/beta": 0.0028099946212023497, "kl/n_epsilon_steps": 0.2874999940395355, "kl/p_epsilon_steps": 0.706250011920929, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -0.8447354435920715, "logits/rejected": -0.791462242603302, "logps/chosen": -526.6029052734375, "logps/ref_chosen": -289.80242919921875, "logps/ref_rejected": -255.99264526367188, "logps/rejected": -595.5274658203125, "loss": 2.379, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.6641503572463989, "rewards/margins": 0.2842598557472229, "rewards/rejected": -0.9484102129936218, "step": 350 }, { "epoch": 0.743455497382199, "grad_norm": 52.74271774291992, "kl/avg_steps": 0.3656249940395355, "kl/beta": 0.0027572487015277147, "kl/n_epsilon_steps": 0.3109374940395355, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 9.474175176609956e-08, "logits/chosen": -0.8031132817268372, "logits/rejected": -0.7760835886001587, "logps/chosen": -517.4700927734375, "logps/ref_chosen": -277.7060241699219, "logps/ref_rejected": -261.61639404296875, "logps/rejected": -599.2596435546875, "loss": 2.4192, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.6601977944374084, "rewards/margins": 0.2653747498989105, "rewards/rejected": -0.9255725741386414, "step": 355 }, { "epoch": 0.7539267015706806, "grad_norm": 35.9310302734375, "kl/avg_steps": 0.40312498807907104, "kl/beta": 0.002704120706766844, "kl/n_epsilon_steps": 0.29218751192092896, "kl/p_epsilon_steps": 0.6953125, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.7600405812263489, "logits/rejected": -0.7285404205322266, "logps/chosen": -517.8834228515625, "logps/ref_chosen": -276.4765930175781, "logps/ref_rejected": -245.36392211914062, "logps/rejected": -587.1243896484375, "loss": 2.4082, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.6516150236129761, "rewards/margins": 0.2669870853424072, "rewards/rejected": -0.9186019897460938, "step": 360 }, { "epoch": 0.7643979057591623, "grad_norm": 37.35695266723633, "kl/avg_steps": 0.37031251192092896, "kl/beta": 0.0026484958361834288, "kl/n_epsilon_steps": 0.3109374940395355, "kl/p_epsilon_steps": 0.6812499761581421, "learning_rate": 8.082144028504231e-08, "logits/chosen": -0.7748720049858093, "logits/rejected": -0.718481719493866, "logps/chosen": -530.27294921875, "logps/ref_chosen": -286.0633850097656, "logps/ref_rejected": -263.576904296875, "logps/rejected": -618.2557373046875, "loss": 2.3771, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.6457995176315308, "rewards/margins": 0.288248211145401, "rewards/rejected": -0.9340476989746094, "step": 365 }, { "epoch": 0.774869109947644, "grad_norm": 40.185733795166016, "kl/avg_steps": 0.46562498807907104, "kl/beta": 0.0025993292219936848, "kl/n_epsilon_steps": 0.2640624940395355, "kl/p_epsilon_steps": 0.729687511920929, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.7946727871894836, "logits/rejected": -0.7593673467636108, "logps/chosen": -518.7752685546875, "logps/ref_chosen": -283.3466796875, "logps/ref_rejected": -256.1686706542969, "logps/rejected": -609.0264282226562, "loss": 2.3308, "rewards/accuracies": 0.7515624761581421, "rewards/chosen": -0.6107124090194702, "rewards/margins": 0.30089524388313293, "rewards/rejected": -0.9116076231002808, "step": 370 }, { "epoch": 0.7853403141361257, "grad_norm": 30.292072296142578, "kl/avg_steps": 0.35468751192092896, "kl/beta": 0.0025424479972571135, "kl/n_epsilon_steps": 0.3203125, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 6.780798075635675e-08, "logits/chosen": -0.8589094877243042, "logits/rejected": -0.7770653963088989, "logps/chosen": -571.2823486328125, "logps/ref_chosen": -316.7373962402344, "logps/ref_rejected": -270.4641418457031, "logps/rejected": -624.1759643554688, "loss": 2.4183, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6464765071868896, "rewards/margins": 0.24820394814014435, "rewards/rejected": -0.894680380821228, "step": 375 }, { "epoch": 0.7958115183246073, "grad_norm": 36.047645568847656, "kl/avg_steps": 0.35624998807907104, "kl/beta": 0.002498726826161146, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.6734374761581421, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.7931220531463623, "logits/rejected": -0.7626051902770996, "logps/chosen": -546.7606201171875, "logps/ref_chosen": -289.0906982421875, "logps/ref_rejected": -268.543701171875, "logps/rejected": -626.6883544921875, "loss": 2.4314, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6431035995483398, "rewards/margins": 0.24701526761054993, "rewards/rejected": -0.8901188969612122, "step": 380 }, { "epoch": 0.806282722513089, "grad_norm": 32.30388259887695, "kl/avg_steps": 0.4234375059604645, "kl/beta": 0.0024520312435925007, "kl/n_epsilon_steps": 0.2828125059604645, "kl/p_epsilon_steps": 0.706250011920929, "learning_rate": 5.57711295439732e-08, "logits/chosen": -0.8205176591873169, "logits/rejected": -0.7670890092849731, "logps/chosen": -525.959228515625, "logps/ref_chosen": -274.06439208984375, "logps/ref_rejected": -266.3952941894531, "logps/rejected": -634.7651977539062, "loss": 2.3642, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.6165703535079956, "rewards/margins": 0.28162333369255066, "rewards/rejected": -0.8981936573982239, "step": 385 }, { "epoch": 0.8167539267015707, "grad_norm": 24.1032657623291, "kl/avg_steps": 0.38749998807907104, "kl/beta": 0.0024003933649510145, "kl/n_epsilon_steps": 0.2984375059604645, "kl/p_epsilon_steps": 0.6859375238418579, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.7663313150405884, "logits/rejected": -0.7912431359291077, "logps/chosen": -555.1474609375, "logps/ref_chosen": -286.0129089355469, "logps/ref_rejected": -267.3469543457031, "logps/rejected": -642.2298583984375, "loss": 2.423, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.644939661026001, "rewards/margins": 0.24981026351451874, "rewards/rejected": -0.8947499990463257, "step": 390 }, { "epoch": 0.8272251308900523, "grad_norm": 34.481441497802734, "kl/avg_steps": 0.4124999940395355, "kl/beta": 0.002353919204324484, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.477540807448832e-08, "logits/chosen": -0.803361713886261, "logits/rejected": -0.7479076385498047, "logps/chosen": -565.6026611328125, "logps/ref_chosen": -295.1082458496094, "logps/ref_rejected": -267.33929443359375, "logps/rejected": -656.7734985351562, "loss": 2.3829, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.6354493498802185, "rewards/margins": 0.27587801218032837, "rewards/rejected": -0.9113273620605469, "step": 395 }, { "epoch": 0.837696335078534, "grad_norm": 27.193138122558594, "kl/avg_steps": 0.3359375, "kl/beta": 0.002311053918674588, "kl/n_epsilon_steps": 0.3265624940395355, "kl/p_epsilon_steps": 0.6625000238418579, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.8362157940864563, "logits/rejected": -0.7958248853683472, "logps/chosen": -570.918212890625, "logps/ref_chosen": -291.07147216796875, "logps/ref_rejected": -268.5450744628906, "logps/rejected": -645.52734375, "loss": 2.4805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6461815237998962, "rewards/margins": 0.2202996462583542, "rewards/rejected": -0.8664811253547668, "step": 400 }, { "epoch": 0.837696335078534, "eval_kl/n_epsilon_steps": 0.31854838132858276, "eval_kl/p_epsilon_steps": 0.6733871102333069, "eval_logits/chosen": -0.8105864524841309, "eval_logits/rejected": -0.770939290523529, "eval_logps/chosen": -567.7598876953125, "eval_logps/ref_chosen": -287.9388427734375, "eval_logps/ref_rejected": -266.7934875488281, "eval_logps/rejected": -657.15625, "eval_loss": 0.6085002422332764, "eval_rewards/accuracies": 0.6905242204666138, "eval_rewards/chosen": -0.6393237709999084, "eval_rewards/margins": 0.2487860471010208, "eval_rewards/rejected": -0.8881098628044128, "eval_runtime": 50.723, "eval_samples_per_second": 39.43, "eval_steps_per_second": 1.242, "step": 400 }, { "epoch": 0.8481675392670157, "grad_norm": 25.73514747619629, "kl/avg_steps": 0.40312498807907104, "kl/beta": 0.0022695644292980433, "kl/n_epsilon_steps": 0.2953124940395355, "kl/p_epsilon_steps": 0.698437511920929, "learning_rate": 3.487975698139084e-08, "logits/chosen": -0.8461328744888306, "logits/rejected": -0.7856892347335815, "logps/chosen": -571.0203857421875, "logps/ref_chosen": -298.4881896972656, "logps/ref_rejected": -272.38616943359375, "logps/rejected": -665.0057983398438, "loss": 2.3891, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6175375580787659, "rewards/margins": 0.2684290409088135, "rewards/rejected": -0.8859665989875793, "step": 405 }, { "epoch": 0.8586387434554974, "grad_norm": 25.438684463500977, "kl/avg_steps": 0.3765625059604645, "kl/beta": 0.0022252278868108988, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 3.036127238347164e-08, "logits/chosen": -0.8197180032730103, "logits/rejected": -0.7961743474006653, "logps/chosen": -564.6549072265625, "logps/ref_chosen": -283.31024169921875, "logps/ref_rejected": -264.3026428222656, "logps/rejected": -655.8562622070312, "loss": 2.4405, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.6252355575561523, "rewards/margins": 0.2411845475435257, "rewards/rejected": -0.8664200901985168, "step": 410 }, { "epoch": 0.8691099476439791, "grad_norm": 31.828752517700195, "kl/avg_steps": 0.3843750059604645, "kl/beta": 0.002183457836508751, "kl/n_epsilon_steps": 0.3062500059604645, "kl/p_epsilon_steps": 0.690625011920929, "learning_rate": 2.613722016414943e-08, "logits/chosen": -0.810443103313446, "logits/rejected": -0.784401535987854, "logps/chosen": -565.6261596679688, "logps/ref_chosen": -284.89312744140625, "logps/ref_rejected": -269.9698486328125, "logps/rejected": -664.2562255859375, "loss": 2.4109, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6117661595344543, "rewards/margins": 0.24424156546592712, "rewards/rejected": -0.8560077548027039, "step": 415 }, { "epoch": 0.8795811518324608, "grad_norm": 53.15812301635742, "kl/avg_steps": 0.38593751192092896, "kl/beta": 0.0021418784745037556, "kl/n_epsilon_steps": 0.3031249940395355, "kl/p_epsilon_steps": 0.6890624761581421, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.8010333180427551, "logits/rejected": -0.7181005477905273, "logps/chosen": -580.8268432617188, "logps/ref_chosen": -292.8439025878906, "logps/ref_rejected": -262.83221435546875, "logps/rejected": -674.4088745117188, "loss": 2.4085, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.6158552169799805, "rewards/margins": 0.2605220675468445, "rewards/rejected": -0.876377284526825, "step": 420 }, { "epoch": 0.8900523560209425, "grad_norm": 30.515869140625, "kl/avg_steps": 0.44999998807907104, "kl/beta": 0.0020984853617846966, "kl/n_epsilon_steps": 0.27031248807907104, "kl/p_epsilon_steps": 0.7203124761581421, "learning_rate": 1.8594660455706763e-08, "logits/chosen": -0.8028408288955688, "logits/rejected": -0.7873013019561768, "logps/chosen": -572.7071533203125, "logps/ref_chosen": -294.400390625, "logps/ref_rejected": -257.50152587890625, "logps/rejected": -652.4434204101562, "loss": 2.4132, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5826085805892944, "rewards/margins": 0.24108126759529114, "rewards/rejected": -0.8236897587776184, "step": 425 }, { "epoch": 0.900523560209424, "grad_norm": 29.375471115112305, "kl/avg_steps": 0.3765625059604645, "kl/beta": 0.002055021934211254, "kl/n_epsilon_steps": 0.3062500059604645, "kl/p_epsilon_steps": 0.682812511920929, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.7730949521064758, "logits/rejected": -0.7212635278701782, "logps/chosen": -577.01123046875, "logps/ref_chosen": -288.0412902832031, "logps/ref_rejected": -265.40423583984375, "logps/rejected": -691.1805419921875, "loss": 2.3683, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.5929520726203918, "rewards/margins": 0.27714505791664124, "rewards/rejected": -0.8700970411300659, "step": 430 }, { "epoch": 0.9109947643979057, "grad_norm": 39.444175720214844, "kl/avg_steps": 0.4124999940395355, "kl/beta": 0.0020161038264632225, "kl/n_epsilon_steps": 0.2890625, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 1.2292508422495157e-08, "logits/chosen": -0.8105589747428894, "logits/rejected": -0.7723300457000732, "logps/chosen": -557.72216796875, "logps/ref_chosen": -273.5352783203125, "logps/ref_rejected": -256.591552734375, "logps/rejected": -656.2905883789062, "loss": 2.4303, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5717044472694397, "rewards/margins": 0.2294149398803711, "rewards/rejected": -0.801119327545166, "step": 435 }, { "epoch": 0.9214659685863874, "grad_norm": 35.571556091308594, "kl/avg_steps": 0.34843748807907104, "kl/beta": 0.0019766315817832947, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.6656249761581421, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.793804943561554, "logits/rejected": -0.739010214805603, "logps/chosen": -584.5250244140625, "logps/ref_chosen": -284.5547180175781, "logps/ref_rejected": -264.2243957519531, "logps/rejected": -664.7174072265625, "loss": 2.4964, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5920853018760681, "rewards/margins": 0.19534674286842346, "rewards/rejected": -0.7874319553375244, "step": 440 }, { "epoch": 0.9319371727748691, "grad_norm": 25.380355834960938, "kl/avg_steps": 0.41718751192092896, "kl/beta": 0.0019389099907130003, "kl/n_epsilon_steps": 0.2874999940395355, "kl/p_epsilon_steps": 0.7046874761581421, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -0.8345752954483032, "logits/rejected": -0.8067754507064819, "logps/chosen": -579.0960693359375, "logps/ref_chosen": -283.0409851074219, "logps/ref_rejected": -267.3383483886719, "logps/rejected": -677.9078369140625, "loss": 2.4509, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5728877782821655, "rewards/margins": 0.2185221165418625, "rewards/rejected": -0.791409969329834, "step": 445 }, { "epoch": 0.9424083769633508, "grad_norm": 28.54435920715332, "kl/avg_steps": 0.44999998807907104, "kl/beta": 0.0018983843037858605, "kl/n_epsilon_steps": 0.2671875059604645, "kl/p_epsilon_steps": 0.7171875238418579, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.7563034892082214, "logits/rejected": -0.7171027660369873, "logps/chosen": -579.7213134765625, "logps/ref_chosen": -290.37457275390625, "logps/ref_rejected": -251.1839599609375, "logps/rejected": -664.0587768554688, "loss": 2.4174, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.547935962677002, "rewards/margins": 0.23112261295318604, "rewards/rejected": -0.779058575630188, "step": 450 }, { "epoch": 0.9528795811518325, "grad_norm": 26.56414794921875, "kl/avg_steps": 0.4046874940395355, "kl/beta": 0.0018582321936264634, "kl/n_epsilon_steps": 0.2953124940395355, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 3.5377236299748147e-09, "logits/chosen": -0.764384388923645, "logits/rejected": -0.7353655099868774, "logps/chosen": -600.6401977539062, "logps/ref_chosen": -299.91766357421875, "logps/ref_rejected": -284.15386962890625, "logps/rejected": -705.1951293945312, "loss": 2.4529, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5578422546386719, "rewards/margins": 0.2201496660709381, "rewards/rejected": -0.7779918909072876, "step": 455 }, { "epoch": 0.9633507853403142, "grad_norm": 26.40264320373535, "kl/avg_steps": 0.34687501192092896, "kl/beta": 0.0018218166660517454, "kl/n_epsilon_steps": 0.32343751192092896, "kl/p_epsilon_steps": 0.6703125238418579, "learning_rate": 2.168758844148272e-09, "logits/chosen": -0.7754079103469849, "logits/rejected": -0.7311118841171265, "logps/chosen": -614.3809204101562, "logps/ref_chosen": -307.8611145019531, "logps/ref_rejected": -278.6595764160156, "logps/rejected": -698.3309936523438, "loss": 2.4819, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -0.5579292178153992, "rewards/margins": 0.2028123438358307, "rewards/rejected": -0.7607415914535522, "step": 460 }, { "epoch": 0.9738219895287958, "grad_norm": 22.700458526611328, "kl/avg_steps": 0.3984375, "kl/beta": 0.0017896599601954222, "kl/n_epsilon_steps": 0.2984375059604645, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -0.8142029643058777, "logits/rejected": -0.7302736043930054, "logps/chosen": -581.177978515625, "logps/ref_chosen": -288.8356018066406, "logps/ref_rejected": -253.9193878173828, "logps/rejected": -668.072998046875, "loss": 2.4467, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5222727060317993, "rewards/margins": 0.2147037535905838, "rewards/rejected": -0.7369765043258667, "step": 465 }, { "epoch": 0.9842931937172775, "grad_norm": 25.03793716430664, "kl/avg_steps": 0.4124999940395355, "kl/beta": 0.0017548914765939116, "kl/n_epsilon_steps": 0.2874999940395355, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.780587375164032, "logits/rejected": -0.7298108339309692, "logps/chosen": -582.7492065429688, "logps/ref_chosen": -297.07720947265625, "logps/ref_rejected": -262.2540588378906, "logps/rejected": -669.369140625, "loss": 2.4556, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5004380345344543, "rewards/margins": 0.20986874401569366, "rewards/rejected": -0.7103067636489868, "step": 470 }, { "epoch": 0.9947643979057592, "grad_norm": 26.781949996948242, "kl/avg_steps": 0.4453125, "kl/beta": 0.0017176285618916154, "kl/n_epsilon_steps": 0.2750000059604645, "kl/p_epsilon_steps": 0.7203124761581421, "learning_rate": 6.032817893297793e-11, "logits/chosen": -0.7933133840560913, "logits/rejected": -0.7786288857460022, "logps/chosen": -558.02197265625, "logps/ref_chosen": -273.3193359375, "logps/ref_rejected": -263.99151611328125, "logps/rejected": -678.0519409179688, "loss": 2.4323, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.4878809452056885, "rewards/margins": 0.2191971242427826, "rewards/rejected": -0.7070780396461487, "step": 475 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 2.463846208664356, "train_runtime": 4358.2481, "train_samples_per_second": 14.027, "train_steps_per_second": 0.109 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }