{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1984, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005040957781978576, "grad_norm": 25.75, "learning_rate": 3e-08, "logits/chosen": -0.07891461253166199, "logits/rejected": 0.004119270481169224, "logps/chosen": -62.022430419921875, "logps/rejected": -65.60428619384766, "loss": 0.6949, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.0029429681599140167, "rewards/margins": -0.0032222606241703033, "rewards/rejected": 0.006165228318423033, "step": 10 }, { "epoch": 0.010081915563957152, "grad_norm": 19.5, "learning_rate": 6.333333333333333e-08, "logits/chosen": -0.058114223182201385, "logits/rejected": 0.02890823781490326, "logps/chosen": -61.93231201171875, "logps/rejected": -65.11712646484375, "loss": 0.6907, "rewards/accuracies": 0.53125, "rewards/chosen": 0.008925376459956169, "rewards/margins": 0.005283808801323175, "rewards/rejected": 0.0036415669601410627, "step": 20 }, { "epoch": 0.015122873345935728, "grad_norm": 25.0, "learning_rate": 9.666666666666666e-08, "logits/chosen": -0.04496127367019653, "logits/rejected": 0.017042722553014755, "logps/chosen": -61.5279541015625, "logps/rejected": -65.24421691894531, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0035877160262316465, "rewards/margins": 0.004559223540127277, "rewards/rejected": -0.0009715079213492572, "step": 30 }, { "epoch": 0.020163831127914304, "grad_norm": 29.25, "learning_rate": 1.3e-07, "logits/chosen": -0.0775897353887558, "logits/rejected": -0.00422413507476449, "logps/chosen": -61.86296463012695, "logps/rejected": -65.46966552734375, "loss": 0.6852, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.006382000632584095, "rewards/margins": 0.016284234821796417, "rewards/rejected": -0.009902234189212322, "step": 40 }, { "epoch": 0.02520478890989288, "grad_norm": 32.0, "learning_rate": 1.6333333333333331e-07, "logits/chosen": -0.10396875441074371, "logits/rejected": -0.028726909309625626, "logps/chosen": -61.53949737548828, "logps/rejected": -65.4787826538086, "loss": 0.6777, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0072999573312699795, "rewards/margins": 0.03146541863679886, "rewards/rejected": -0.02416546270251274, "step": 50 }, { "epoch": 0.030245746691871456, "grad_norm": 28.125, "learning_rate": 1.9666666666666665e-07, "logits/chosen": -0.15684179961681366, "logits/rejected": -0.08579285442829132, "logps/chosen": -62.388755798339844, "logps/rejected": -65.70417785644531, "loss": 0.6749, "rewards/accuracies": 0.8125, "rewards/chosen": -0.004609875846654177, "rewards/margins": 0.03752168267965317, "rewards/rejected": -0.04213155806064606, "step": 60 }, { "epoch": 0.03528670447385003, "grad_norm": 32.75, "learning_rate": 2.3e-07, "logits/chosen": -0.14031846821308136, "logits/rejected": -0.054399728775024414, "logps/chosen": -62.0086669921875, "logps/rejected": -66.37232971191406, "loss": 0.6629, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.01638166978955269, "rewards/margins": 0.0622851625084877, "rewards/rejected": -0.07866682857275009, "step": 70 }, { "epoch": 0.04032766225582861, "grad_norm": 30.25, "learning_rate": 2.633333333333333e-07, "logits/chosen": -0.19501006603240967, "logits/rejected": -0.1102059856057167, "logps/chosen": -62.68883514404297, "logps/rejected": -66.286865234375, "loss": 0.6564, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.03311951085925102, "rewards/margins": 0.07680130004882812, "rewards/rejected": -0.10992081463336945, "step": 80 }, { "epoch": 0.045368620037807186, "grad_norm": 32.0, "learning_rate": 2.966666666666667e-07, "logits/chosen": -0.24648718535900116, "logits/rejected": -0.16621707379817963, "logps/chosen": -62.35799026489258, "logps/rejected": -67.0835189819336, "loss": 0.6371, "rewards/accuracies": 0.875, "rewards/chosen": -0.039066143333911896, "rewards/margins": 0.1181197538971901, "rewards/rejected": -0.1571858823299408, "step": 90 }, { "epoch": 0.05040957781978576, "grad_norm": 26.0, "learning_rate": 3.3e-07, "logits/chosen": -0.2515576481819153, "logits/rejected": -0.19324862957000732, "logps/chosen": -62.55232620239258, "logps/rejected": -67.55314636230469, "loss": 0.6217, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.06582323461771011, "rewards/margins": 0.15158048272132874, "rewards/rejected": -0.21740372478961945, "step": 100 }, { "epoch": 0.05040957781978576, "eval_logits/chosen": -0.3329714834690094, "eval_logits/rejected": -0.27068039774894714, "eval_logps/chosen": -62.82643508911133, "eval_logps/rejected": -67.86115264892578, "eval_loss": 0.6163578629493713, "eval_rewards/accuracies": 0.8772454857826233, "eval_rewards/chosen": -0.08201639354228973, "eval_rewards/margins": 0.16533808410167694, "eval_rewards/rejected": -0.24735447764396667, "eval_runtime": 71.6288, "eval_samples_per_second": 23.315, "eval_steps_per_second": 23.315, "step": 100 }, { "epoch": 0.055450535601764335, "grad_norm": 34.5, "learning_rate": 3.6333333333333333e-07, "logits/chosen": -0.38161173462867737, "logits/rejected": -0.3071076273918152, "logps/chosen": -62.72125244140625, "logps/rejected": -68.94508361816406, "loss": 0.5933, "rewards/accuracies": 0.90625, "rewards/chosen": -0.07404644042253494, "rewards/margins": 0.2169165164232254, "rewards/rejected": -0.29096299409866333, "step": 110 }, { "epoch": 0.06049149338374291, "grad_norm": 25.375, "learning_rate": 3.9666666666666665e-07, "logits/chosen": -0.40286582708358765, "logits/rejected": -0.34470418095588684, "logps/chosen": -63.3223762512207, "logps/rejected": -68.89364624023438, "loss": 0.5858, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.1222512498497963, "rewards/margins": 0.23771443963050842, "rewards/rejected": -0.3599656820297241, "step": 120 }, { "epoch": 0.06553245116572148, "grad_norm": 23.125, "learning_rate": 4.2999999999999996e-07, "logits/chosen": -0.4781390130519867, "logits/rejected": -0.4242860674858093, "logps/chosen": -63.421775817871094, "logps/rejected": -69.99397277832031, "loss": 0.5631, "rewards/accuracies": 0.90625, "rewards/chosen": -0.15090402960777283, "rewards/margins": 0.29352977871894836, "rewards/rejected": -0.4444337785243988, "step": 130 }, { "epoch": 0.07057340894770006, "grad_norm": 23.25, "learning_rate": 4.633333333333333e-07, "logits/chosen": -0.5767303705215454, "logits/rejected": -0.5158231854438782, "logps/chosen": -64.02821350097656, "logps/rejected": -70.48841857910156, "loss": 0.5487, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.18629249930381775, "rewards/margins": 0.34241610765457153, "rewards/rejected": -0.5287086367607117, "step": 140 }, { "epoch": 0.07561436672967864, "grad_norm": 23.125, "learning_rate": 4.966666666666666e-07, "logits/chosen": -0.6403996348381042, "logits/rejected": -0.6139777898788452, "logps/chosen": -63.662750244140625, "logps/rejected": -71.99102020263672, "loss": 0.5035, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.180726557970047, "rewards/margins": 0.4561690390110016, "rewards/rejected": -0.6368955373764038, "step": 150 }, { "epoch": 0.08065532451165722, "grad_norm": 24.0, "learning_rate": 4.975463467829879e-07, "logits/chosen": -0.6926871538162231, "logits/rejected": -0.6587594747543335, "logps/chosen": -63.4420051574707, "logps/rejected": -72.86981201171875, "loss": 0.4668, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1938735544681549, "rewards/margins": 0.5592324137687683, "rewards/rejected": -0.7531059980392456, "step": 160 }, { "epoch": 0.0856962822936358, "grad_norm": 23.125, "learning_rate": 4.948200654307524e-07, "logits/chosen": -0.7586608529090881, "logits/rejected": -0.737440824508667, "logps/chosen": -64.16552734375, "logps/rejected": -72.98350524902344, "loss": 0.4687, "rewards/accuracies": 0.875, "rewards/chosen": -0.22095069289207458, "rewards/margins": 0.5723496675491333, "rewards/rejected": -0.7933003306388855, "step": 170 }, { "epoch": 0.09073724007561437, "grad_norm": 22.5, "learning_rate": 4.920937840785169e-07, "logits/chosen": -0.8342425227165222, "logits/rejected": -0.8266761898994446, "logps/chosen": -64.02726745605469, "logps/rejected": -74.54716491699219, "loss": 0.438, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.22078147530555725, "rewards/margins": 0.6804903745651245, "rewards/rejected": -0.9012719392776489, "step": 180 }, { "epoch": 0.09577819785759294, "grad_norm": 22.125, "learning_rate": 4.893675027262814e-07, "logits/chosen": -0.8242881894111633, "logits/rejected": -0.8162325620651245, "logps/chosen": -64.03907012939453, "logps/rejected": -75.18043518066406, "loss": 0.4107, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21485848724842072, "rewards/margins": 0.7709285616874695, "rewards/rejected": -0.985787034034729, "step": 190 }, { "epoch": 0.10081915563957151, "grad_norm": 19.375, "learning_rate": 4.866412213740458e-07, "logits/chosen": -0.8690522909164429, "logits/rejected": -0.8740390539169312, "logps/chosen": -63.641624450683594, "logps/rejected": -75.82588195800781, "loss": 0.3957, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1833191215991974, "rewards/margins": 0.8537132143974304, "rewards/rejected": -1.0370323657989502, "step": 200 }, { "epoch": 0.10081915563957151, "eval_logits/chosen": -0.8788526058197021, "eval_logits/rejected": -0.8778823018074036, "eval_logps/chosen": -64.24597930908203, "eval_logps/rejected": -75.78997802734375, "eval_loss": 0.41245341300964355, "eval_rewards/accuracies": 0.8772454857826233, "eval_rewards/chosen": -0.22397060692310333, "eval_rewards/margins": 0.8162661790847778, "eval_rewards/rejected": -1.0402368307113647, "eval_runtime": 73.4377, "eval_samples_per_second": 22.74, "eval_steps_per_second": 22.74, "step": 200 }, { "epoch": 0.10586011342155009, "grad_norm": 16.375, "learning_rate": 4.839149400218102e-07, "logits/chosen": -0.9034843444824219, "logits/rejected": -0.9223787188529968, "logps/chosen": -63.76435089111328, "logps/rejected": -76.31708526611328, "loss": 0.3748, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.18892225623130798, "rewards/margins": 0.9179438352584839, "rewards/rejected": -1.1068661212921143, "step": 210 }, { "epoch": 0.11090107120352867, "grad_norm": 19.25, "learning_rate": 4.811886586695747e-07, "logits/chosen": -0.9078966379165649, "logits/rejected": -0.9262178540229797, "logps/chosen": -63.7225456237793, "logps/rejected": -76.8105239868164, "loss": 0.3902, "rewards/accuracies": 0.875, "rewards/chosen": -0.18831631541252136, "rewards/margins": 0.9241956472396851, "rewards/rejected": -1.1125118732452393, "step": 220 }, { "epoch": 0.11594202898550725, "grad_norm": 20.5, "learning_rate": 4.784623773173392e-07, "logits/chosen": -0.9634265899658203, "logits/rejected": -0.9702705144882202, "logps/chosen": -63.542388916015625, "logps/rejected": -77.10391998291016, "loss": 0.3663, "rewards/accuracies": 0.90625, "rewards/chosen": -0.19163846969604492, "rewards/margins": 0.9870219230651855, "rewards/rejected": -1.1786603927612305, "step": 230 }, { "epoch": 0.12098298676748583, "grad_norm": 19.125, "learning_rate": 4.7573609596510354e-07, "logits/chosen": -0.920444130897522, "logits/rejected": -0.9512295722961426, "logps/chosen": -63.34272384643555, "logps/rejected": -77.4976577758789, "loss": 0.3607, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.14525040984153748, "rewards/margins": 1.072211503982544, "rewards/rejected": -1.2174618244171143, "step": 240 }, { "epoch": 0.1260239445494644, "grad_norm": 19.125, "learning_rate": 4.7300981461286803e-07, "logits/chosen": -0.9372714757919312, "logits/rejected": -0.9608826637268066, "logps/chosen": -63.630760192871094, "logps/rejected": -77.2890625, "loss": 0.3813, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14888228476047516, "rewards/margins": 1.0466125011444092, "rewards/rejected": -1.1954947710037231, "step": 250 }, { "epoch": 0.13106490233144297, "grad_norm": 16.125, "learning_rate": 4.7028353326063247e-07, "logits/chosen": -0.9241989254951477, "logits/rejected": -0.9565703272819519, "logps/chosen": -62.576759338378906, "logps/rejected": -78.99192810058594, "loss": 0.2995, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06501423567533493, "rewards/margins": 1.2511422634124756, "rewards/rejected": -1.3161563873291016, "step": 260 }, { "epoch": 0.13610586011342155, "grad_norm": 18.375, "learning_rate": 4.675572519083969e-07, "logits/chosen": -0.9289599657058716, "logits/rejected": -0.959551990032196, "logps/chosen": -64.06565856933594, "logps/rejected": -78.49452209472656, "loss": 0.365, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.16143986582756042, "rewards/margins": 1.1138832569122314, "rewards/rejected": -1.2753230333328247, "step": 270 }, { "epoch": 0.14114681789540012, "grad_norm": 14.75, "learning_rate": 4.648309705561614e-07, "logits/chosen": -0.9531835317611694, "logits/rejected": -0.9772550463676453, "logps/chosen": -63.0753173828125, "logps/rejected": -78.2010726928711, "loss": 0.3479, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.13170328736305237, "rewards/margins": 1.1704909801483154, "rewards/rejected": -1.3021942377090454, "step": 280 }, { "epoch": 0.1461877756773787, "grad_norm": 14.125, "learning_rate": 4.6210468920392583e-07, "logits/chosen": -0.9216547012329102, "logits/rejected": -0.9838453531265259, "logps/chosen": -62.03778076171875, "logps/rejected": -79.33902740478516, "loss": 0.2986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.029102539643645287, "rewards/margins": 1.3270413875579834, "rewards/rejected": -1.3561439514160156, "step": 290 }, { "epoch": 0.15122873345935728, "grad_norm": 18.375, "learning_rate": 4.5937840785169027e-07, "logits/chosen": -0.9648447036743164, "logits/rejected": -1.0225803852081299, "logps/chosen": -61.83771514892578, "logps/rejected": -79.71820831298828, "loss": 0.2767, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.035991497337818146, "rewards/margins": 1.3972457647323608, "rewards/rejected": -1.4332373142242432, "step": 300 }, { "epoch": 0.15122873345935728, "eval_logits/chosen": -0.977104663848877, "eval_logits/rejected": -1.0141139030456543, "eval_logps/chosen": -63.36873245239258, "eval_logps/rejected": -79.43486022949219, "eval_loss": 0.3420425057411194, "eval_rewards/accuracies": 0.8808383345603943, "eval_rewards/chosen": -0.1362457126379013, "eval_rewards/margins": 1.2684792280197144, "eval_rewards/rejected": -1.4047249555587769, "eval_runtime": 70.6339, "eval_samples_per_second": 23.643, "eval_steps_per_second": 23.643, "step": 300 }, { "epoch": 0.15626969124133586, "grad_norm": 13.75, "learning_rate": 4.566521264994547e-07, "logits/chosen": -0.9742618799209595, "logits/rejected": -1.0176128149032593, "logps/chosen": -63.59131622314453, "logps/rejected": -79.64852905273438, "loss": 0.3397, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.14291712641716003, "rewards/margins": 1.2807643413543701, "rewards/rejected": -1.4236814975738525, "step": 310 }, { "epoch": 0.16131064902331443, "grad_norm": 21.625, "learning_rate": 4.5392584514721915e-07, "logits/chosen": -0.9295794367790222, "logits/rejected": -0.9652513265609741, "logps/chosen": -62.136070251464844, "logps/rejected": -80.7836685180664, "loss": 0.2774, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.03530962020158768, "rewards/margins": 1.468548059463501, "rewards/rejected": -1.5038578510284424, "step": 320 }, { "epoch": 0.166351606805293, "grad_norm": 14.4375, "learning_rate": 4.5119956379498364e-07, "logits/chosen": -0.9235115051269531, "logits/rejected": -0.9803950190544128, "logps/chosen": -62.91417694091797, "logps/rejected": -80.86149597167969, "loss": 0.292, "rewards/accuracies": 0.90625, "rewards/chosen": -0.07058247923851013, "rewards/margins": 1.4306252002716064, "rewards/rejected": -1.5012075901031494, "step": 330 }, { "epoch": 0.1713925645872716, "grad_norm": 22.875, "learning_rate": 4.484732824427481e-07, "logits/chosen": -0.9881145358085632, "logits/rejected": -1.0382649898529053, "logps/chosen": -62.81303787231445, "logps/rejected": -81.26265716552734, "loss": 0.278, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.061654943972826004, "rewards/margins": 1.5001782178878784, "rewards/rejected": -1.561833143234253, "step": 340 }, { "epoch": 0.17643352236925017, "grad_norm": 10.25, "learning_rate": 4.457470010905125e-07, "logits/chosen": -0.9716188311576843, "logits/rejected": -1.0232843160629272, "logps/chosen": -63.168006896972656, "logps/rejected": -79.84440612792969, "loss": 0.3661, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.10739569365978241, "rewards/margins": 1.3512216806411743, "rewards/rejected": -1.4586174488067627, "step": 350 }, { "epoch": 0.18147448015122875, "grad_norm": 16.125, "learning_rate": 4.43020719738277e-07, "logits/chosen": -0.9138787388801575, "logits/rejected": -0.9851690530776978, "logps/chosen": -61.79378128051758, "logps/rejected": -81.6797866821289, "loss": 0.2695, "rewards/accuracies": 0.90625, "rewards/chosen": 0.00970448087900877, "rewards/margins": 1.6192023754119873, "rewards/rejected": -1.6094980239868164, "step": 360 }, { "epoch": 0.18651543793320732, "grad_norm": 12.0625, "learning_rate": 4.402944383860414e-07, "logits/chosen": -0.932845950126648, "logits/rejected": -0.98590487241745, "logps/chosen": -62.03528594970703, "logps/rejected": -81.13326263427734, "loss": 0.2783, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0034210742451250553, "rewards/margins": 1.5679662227630615, "rewards/rejected": -1.5713872909545898, "step": 370 }, { "epoch": 0.19155639571518587, "grad_norm": 9.625, "learning_rate": 4.375681570338059e-07, "logits/chosen": -0.9335094690322876, "logits/rejected": -0.991866946220398, "logps/chosen": -61.888038635253906, "logps/rejected": -80.99397277832031, "loss": 0.3039, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0033101304434239864, "rewards/margins": 1.5600807666778564, "rewards/rejected": -1.5567705631256104, "step": 380 }, { "epoch": 0.19659735349716445, "grad_norm": 24.5, "learning_rate": 4.348418756815703e-07, "logits/chosen": -0.9722970724105835, "logits/rejected": -1.0437185764312744, "logps/chosen": -62.06909942626953, "logps/rejected": -80.99043273925781, "loss": 0.2994, "rewards/accuracies": 0.90625, "rewards/chosen": -0.017526980489492416, "rewards/margins": 1.57229483127594, "rewards/rejected": -1.5898219347000122, "step": 390 }, { "epoch": 0.20163831127914303, "grad_norm": 14.3125, "learning_rate": 4.3211559432933476e-07, "logits/chosen": -0.9651460647583008, "logits/rejected": -1.0433061122894287, "logps/chosen": -61.74712371826172, "logps/rejected": -82.36299133300781, "loss": 0.2577, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.03247659280896187, "rewards/margins": 1.6827892065048218, "rewards/rejected": -1.6503126621246338, "step": 400 }, { "epoch": 0.20163831127914303, "eval_logits/chosen": -0.9533343315124512, "eval_logits/rejected": -1.0109245777130127, "eval_logps/chosen": -62.41316604614258, "eval_logps/rejected": -81.344482421875, "eval_loss": 0.3170239329338074, "eval_rewards/accuracies": 0.8856287598609924, "eval_rewards/chosen": -0.040688931941986084, "eval_rewards/margins": 1.5549986362457275, "eval_rewards/rejected": -1.5956875085830688, "eval_runtime": 71.3317, "eval_samples_per_second": 23.412, "eval_steps_per_second": 23.412, "step": 400 }, { "epoch": 0.2066792690611216, "grad_norm": 24.25, "learning_rate": 4.2938931297709925e-07, "logits/chosen": -0.9435351490974426, "logits/rejected": -1.0250452756881714, "logps/chosen": -61.98693084716797, "logps/rejected": -82.12709045410156, "loss": 0.2696, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.0036094195675104856, "rewards/margins": 1.654160737991333, "rewards/rejected": -1.650551199913025, "step": 410 }, { "epoch": 0.21172022684310018, "grad_norm": 11.5625, "learning_rate": 4.266630316248637e-07, "logits/chosen": -0.9397176504135132, "logits/rejected": -1.0133641958236694, "logps/chosen": -61.4076042175293, "logps/rejected": -83.43376159667969, "loss": 0.258, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0406290665268898, "rewards/margins": 1.7928526401519775, "rewards/rejected": -1.7522236108779907, "step": 420 }, { "epoch": 0.21676118462507876, "grad_norm": 17.75, "learning_rate": 4.239367502726281e-07, "logits/chosen": -0.9263733625411987, "logits/rejected": -1.007187843322754, "logps/chosen": -60.838951110839844, "logps/rejected": -82.61154174804688, "loss": 0.2287, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.09760002046823502, "rewards/margins": 1.8123610019683838, "rewards/rejected": -1.7147607803344727, "step": 430 }, { "epoch": 0.22180214240705734, "grad_norm": 18.375, "learning_rate": 4.2121046892039257e-07, "logits/chosen": -0.9172664880752563, "logits/rejected": -0.98748779296875, "logps/chosen": -61.5716667175293, "logps/rejected": -82.24566650390625, "loss": 0.2758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.008749181404709816, "rewards/margins": 1.7142765522003174, "rewards/rejected": -1.7055273056030273, "step": 440 }, { "epoch": 0.22684310018903592, "grad_norm": 28.75, "learning_rate": 4.18484187568157e-07, "logits/chosen": -0.9780591726303101, "logits/rejected": -1.02444326877594, "logps/chosen": -62.16161346435547, "logps/rejected": -81.88623046875, "loss": 0.2933, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.024681804701685905, "rewards/margins": 1.6520192623138428, "rewards/rejected": -1.6767011880874634, "step": 450 }, { "epoch": 0.2318840579710145, "grad_norm": 15.375, "learning_rate": 4.1575790621592144e-07, "logits/chosen": -0.9287646412849426, "logits/rejected": -0.9916049838066101, "logps/chosen": -61.4310188293457, "logps/rejected": -82.45499420166016, "loss": 0.2939, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.04269082099199295, "rewards/margins": 1.7475837469100952, "rewards/rejected": -1.704892873764038, "step": 460 }, { "epoch": 0.23692501575299307, "grad_norm": 13.75, "learning_rate": 4.1303162486368593e-07, "logits/chosen": -0.9281194806098938, "logits/rejected": -1.0078294277191162, "logps/chosen": -60.9696159362793, "logps/rejected": -82.53529357910156, "loss": 0.2433, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.07616719603538513, "rewards/margins": 1.8099626302719116, "rewards/rejected": -1.7337955236434937, "step": 470 }, { "epoch": 0.24196597353497165, "grad_norm": 24.125, "learning_rate": 4.1030534351145037e-07, "logits/chosen": -0.9563441276550293, "logits/rejected": -1.0490442514419556, "logps/chosen": -61.71998977661133, "logps/rejected": -81.29328918457031, "loss": 0.3422, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.017765596508979797, "rewards/margins": 1.6064462661743164, "rewards/rejected": -1.5886808633804321, "step": 480 }, { "epoch": 0.24700693131695023, "grad_norm": 18.75, "learning_rate": 4.075790621592148e-07, "logits/chosen": -0.9509885907173157, "logits/rejected": -1.0152177810668945, "logps/chosen": -60.49787139892578, "logps/rejected": -82.62560272216797, "loss": 0.2712, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.13019904494285583, "rewards/margins": 1.8366725444793701, "rewards/rejected": -1.7064735889434814, "step": 490 }, { "epoch": 0.2520478890989288, "grad_norm": 20.0, "learning_rate": 4.0485278080697925e-07, "logits/chosen": -0.9022938013076782, "logits/rejected": -0.9922693371772766, "logps/chosen": -61.09675216674805, "logps/rejected": -82.68385314941406, "loss": 0.2781, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.08039947599172592, "rewards/margins": 1.8100239038467407, "rewards/rejected": -1.7296245098114014, "step": 500 }, { "epoch": 0.2520478890989288, "eval_logits/chosen": -0.9490191340446472, "eval_logits/rejected": -1.0179123878479004, "eval_logps/chosen": -61.8110466003418, "eval_logps/rejected": -82.53450775146484, "eval_loss": 0.30638858675956726, "eval_rewards/accuracies": 0.8856287598609924, "eval_rewards/chosen": 0.019522832706570625, "eval_rewards/margins": 1.734211802482605, "eval_rewards/rejected": -1.7146891355514526, "eval_runtime": 72.5423, "eval_samples_per_second": 23.021, "eval_steps_per_second": 23.021, "step": 500 }, { "epoch": 0.2570888468809074, "grad_norm": 32.25, "learning_rate": 4.021264994547437e-07, "logits/chosen": -0.9350666999816895, "logits/rejected": -1.0141990184783936, "logps/chosen": -61.0509033203125, "logps/rejected": -82.5855712890625, "loss": 0.2704, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08123103529214859, "rewards/margins": 1.7908775806427002, "rewards/rejected": -1.7096465826034546, "step": 510 }, { "epoch": 0.26212980466288593, "grad_norm": 25.75, "learning_rate": 3.994002181025082e-07, "logits/chosen": -0.9520455598831177, "logits/rejected": -1.0035117864608765, "logps/chosen": -60.493560791015625, "logps/rejected": -82.90555572509766, "loss": 0.2678, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1298852264881134, "rewards/margins": 1.8753770589828491, "rewards/rejected": -1.7454917430877686, "step": 520 }, { "epoch": 0.26717076244486454, "grad_norm": 9.375, "learning_rate": 3.966739367502726e-07, "logits/chosen": -0.9617950320243835, "logits/rejected": -1.0328407287597656, "logps/chosen": -61.49128341674805, "logps/rejected": -81.98192596435547, "loss": 0.3276, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.05107005685567856, "rewards/margins": 1.7408430576324463, "rewards/rejected": -1.6897728443145752, "step": 530 }, { "epoch": 0.2722117202268431, "grad_norm": 24.375, "learning_rate": 3.9394765539803705e-07, "logits/chosen": -0.9145433306694031, "logits/rejected": -0.9903620481491089, "logps/chosen": -60.99120330810547, "logps/rejected": -83.13854217529297, "loss": 0.2787, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.09986008703708649, "rewards/margins": 1.8553259372711182, "rewards/rejected": -1.7554658651351929, "step": 540 }, { "epoch": 0.2772526780088217, "grad_norm": 24.125, "learning_rate": 3.9122137404580155e-07, "logits/chosen": -0.9315235018730164, "logits/rejected": -1.0184853076934814, "logps/chosen": -61.380882263183594, "logps/rejected": -83.67707061767578, "loss": 0.2747, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0887889489531517, "rewards/margins": 1.894553780555725, "rewards/rejected": -1.8057647943496704, "step": 550 }, { "epoch": 0.28229363579080025, "grad_norm": 11.875, "learning_rate": 3.8849509269356593e-07, "logits/chosen": -0.8931863903999329, "logits/rejected": -0.9775617718696594, "logps/chosen": -60.56890869140625, "logps/rejected": -83.23638916015625, "loss": 0.2602, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1255245953798294, "rewards/margins": 1.8955485820770264, "rewards/rejected": -1.770024061203003, "step": 560 }, { "epoch": 0.28733459357277885, "grad_norm": 13.0, "learning_rate": 3.8576881134133037e-07, "logits/chosen": -0.9146768450737, "logits/rejected": -1.018970251083374, "logps/chosen": -60.158790588378906, "logps/rejected": -83.05947875976562, "loss": 0.258, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.1570456475019455, "rewards/margins": 1.9360195398330688, "rewards/rejected": -1.778973937034607, "step": 570 }, { "epoch": 0.2923755513547574, "grad_norm": 25.0, "learning_rate": 3.8304252998909486e-07, "logits/chosen": -0.899819552898407, "logits/rejected": -0.990888774394989, "logps/chosen": -61.66679763793945, "logps/rejected": -81.74058532714844, "loss": 0.3258, "rewards/accuracies": 0.84375, "rewards/chosen": 0.05306190997362137, "rewards/margins": 1.7001397609710693, "rewards/rejected": -1.6470777988433838, "step": 580 }, { "epoch": 0.297416509136736, "grad_norm": 15.1875, "learning_rate": 3.803162486368593e-07, "logits/chosen": -0.9033399820327759, "logits/rejected": -0.9803060293197632, "logps/chosen": -60.785057067871094, "logps/rejected": -83.88690185546875, "loss": 0.2538, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.10141563415527344, "rewards/margins": 1.9382518529891968, "rewards/rejected": -1.8368362188339233, "step": 590 }, { "epoch": 0.30245746691871456, "grad_norm": 15.25, "learning_rate": 3.775899672846238e-07, "logits/chosen": -0.9281272888183594, "logits/rejected": -1.005056619644165, "logps/chosen": -62.294593811035156, "logps/rejected": -82.4925308227539, "loss": 0.3627, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.008905407041311264, "rewards/margins": 1.7188129425048828, "rewards/rejected": -1.7099075317382812, "step": 600 }, { "epoch": 0.30245746691871456, "eval_logits/chosen": -0.9133957028388977, "eval_logits/rejected": -0.9896692037582397, "eval_logps/chosen": -61.07933807373047, "eval_logps/rejected": -82.90621948242188, "eval_loss": 0.30104419589042664, "eval_rewards/accuracies": 0.8844311237335205, "eval_rewards/chosen": 0.09269363433122635, "eval_rewards/margins": 1.8445546627044678, "eval_rewards/rejected": -1.7518609762191772, "eval_runtime": 73.1888, "eval_samples_per_second": 22.818, "eval_steps_per_second": 22.818, "step": 600 }, { "epoch": 0.3074984247006931, "grad_norm": 30.5, "learning_rate": 3.7486368593238823e-07, "logits/chosen": -0.9185993075370789, "logits/rejected": -1.0144493579864502, "logps/chosen": -61.656349182128906, "logps/rejected": -81.76588439941406, "loss": 0.3439, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.051804833114147186, "rewards/margins": 1.708714485168457, "rewards/rejected": -1.6569095849990845, "step": 610 }, { "epoch": 0.3125393824826717, "grad_norm": 18.0, "learning_rate": 3.721374045801526e-07, "logits/chosen": -0.9089874029159546, "logits/rejected": -0.9995435476303101, "logps/chosen": -60.14899444580078, "logps/rejected": -82.75482177734375, "loss": 0.2812, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.16136237978935242, "rewards/margins": 1.9296493530273438, "rewards/rejected": -1.768286943435669, "step": 620 }, { "epoch": 0.31758034026465026, "grad_norm": 28.0, "learning_rate": 3.694111232279171e-07, "logits/chosen": -0.9505274891853333, "logits/rejected": -1.028907060623169, "logps/chosen": -60.5921516418457, "logps/rejected": -83.64252471923828, "loss": 0.2844, "rewards/accuracies": 0.875, "rewards/chosen": 0.13824741542339325, "rewards/margins": 1.9466102123260498, "rewards/rejected": -1.808362603187561, "step": 630 }, { "epoch": 0.32262129804662887, "grad_norm": 22.625, "learning_rate": 3.6668484187568154e-07, "logits/chosen": -0.9480802416801453, "logits/rejected": -1.0301778316497803, "logps/chosen": -60.25774002075195, "logps/rejected": -82.69186401367188, "loss": 0.2888, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.15573999285697937, "rewards/margins": 1.877264380455017, "rewards/rejected": -1.7215244770050049, "step": 640 }, { "epoch": 0.3276622558286074, "grad_norm": 28.25, "learning_rate": 3.63958560523446e-07, "logits/chosen": -0.9399679899215698, "logits/rejected": -0.9984865188598633, "logps/chosen": -61.39536666870117, "logps/rejected": -82.51799774169922, "loss": 0.3268, "rewards/accuracies": 0.875, "rewards/chosen": 0.06745560467243195, "rewards/margins": 1.796059250831604, "rewards/rejected": -1.7286036014556885, "step": 650 }, { "epoch": 0.332703213610586, "grad_norm": 30.125, "learning_rate": 3.6123227917121047e-07, "logits/chosen": -0.9196429252624512, "logits/rejected": -1.0079903602600098, "logps/chosen": -60.20268630981445, "logps/rejected": -84.8389663696289, "loss": 0.2459, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.16887502372264862, "rewards/margins": 2.105717420578003, "rewards/rejected": -1.9368425607681274, "step": 660 }, { "epoch": 0.3377441713925646, "grad_norm": 26.75, "learning_rate": 3.585059978189749e-07, "logits/chosen": -0.9201717376708984, "logits/rejected": -1.001564621925354, "logps/chosen": -60.38823318481445, "logps/rejected": -83.207275390625, "loss": 0.2703, "rewards/accuracies": 0.90625, "rewards/chosen": 0.17042402923107147, "rewards/margins": 1.9416764974594116, "rewards/rejected": -1.7712528705596924, "step": 670 }, { "epoch": 0.3427851291745432, "grad_norm": 10.3125, "learning_rate": 3.557797164667394e-07, "logits/chosen": -0.8961941003799438, "logits/rejected": -0.9779438972473145, "logps/chosen": -59.930824279785156, "logps/rejected": -83.56443786621094, "loss": 0.2806, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.18050600588321686, "rewards/margins": 1.9991801977157593, "rewards/rejected": -1.818674087524414, "step": 680 }, { "epoch": 0.34782608695652173, "grad_norm": 22.25, "learning_rate": 3.530534351145038e-07, "logits/chosen": -0.9111859202384949, "logits/rejected": -1.0103601217269897, "logps/chosen": -59.5981559753418, "logps/rejected": -84.48451232910156, "loss": 0.2155, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.22089195251464844, "rewards/margins": 2.098818063735962, "rewards/rejected": -1.8779258728027344, "step": 690 }, { "epoch": 0.35286704473850034, "grad_norm": 8.4375, "learning_rate": 3.503271537622682e-07, "logits/chosen": -0.8836026191711426, "logits/rejected": -0.9876821637153625, "logps/chosen": -59.78227615356445, "logps/rejected": -84.5053939819336, "loss": 0.2411, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.20316052436828613, "rewards/margins": 2.080014944076538, "rewards/rejected": -1.8768543004989624, "step": 700 }, { "epoch": 0.35286704473850034, "eval_logits/chosen": -0.9257686138153076, "eval_logits/rejected": -1.0098994970321655, "eval_logps/chosen": -60.61675262451172, "eval_logps/rejected": -83.34400939941406, "eval_loss": 0.2979341447353363, "eval_rewards/accuracies": 0.886227548122406, "eval_rewards/chosen": 0.13895215094089508, "eval_rewards/margins": 1.9345914125442505, "eval_rewards/rejected": -1.7956393957138062, "eval_runtime": 73.2808, "eval_samples_per_second": 22.789, "eval_steps_per_second": 22.789, "step": 700 }, { "epoch": 0.3579080025204789, "grad_norm": 11.8125, "learning_rate": 3.476008724100327e-07, "logits/chosen": -0.8866975903511047, "logits/rejected": -0.9669508934020996, "logps/chosen": -60.970123291015625, "logps/rejected": -83.34266662597656, "loss": 0.3138, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0989324077963829, "rewards/margins": 1.9064470529556274, "rewards/rejected": -1.8075145483016968, "step": 710 }, { "epoch": 0.3629489603024575, "grad_norm": 12.125, "learning_rate": 3.4487459105779716e-07, "logits/chosen": -0.8704292178153992, "logits/rejected": -0.9828470349311829, "logps/chosen": -58.7971076965332, "logps/rejected": -83.87742614746094, "loss": 0.2295, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.2823036015033722, "rewards/margins": 2.1361794471740723, "rewards/rejected": -1.8538758754730225, "step": 720 }, { "epoch": 0.36798991808443604, "grad_norm": 27.75, "learning_rate": 3.421483097055616e-07, "logits/chosen": -0.8949785232543945, "logits/rejected": -0.9850034713745117, "logps/chosen": -59.54541778564453, "logps/rejected": -85.09830474853516, "loss": 0.1848, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.2564947009086609, "rewards/margins": 2.1756319999694824, "rewards/rejected": -1.9191375970840454, "step": 730 }, { "epoch": 0.37303087586641465, "grad_norm": 30.625, "learning_rate": 3.394220283533261e-07, "logits/chosen": -0.9334227442741394, "logits/rejected": -1.010840654373169, "logps/chosen": -60.2901725769043, "logps/rejected": -84.69157409667969, "loss": 0.2416, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.15744934976100922, "rewards/margins": 2.060612678527832, "rewards/rejected": -1.9031633138656616, "step": 740 }, { "epoch": 0.3780718336483932, "grad_norm": 20.0, "learning_rate": 3.3669574700109047e-07, "logits/chosen": -0.9105051159858704, "logits/rejected": -1.0369561910629272, "logps/chosen": -57.91829299926758, "logps/rejected": -85.75810241699219, "loss": 0.1725, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.35346147418022156, "rewards/margins": 2.3853201866149902, "rewards/rejected": -2.0318589210510254, "step": 750 }, { "epoch": 0.38311279143037175, "grad_norm": 24.375, "learning_rate": 3.339694656488549e-07, "logits/chosen": -0.8942914009094238, "logits/rejected": -1.0089881420135498, "logps/chosen": -60.97709274291992, "logps/rejected": -84.78167724609375, "loss": 0.2952, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.10825882852077484, "rewards/margins": 2.0440988540649414, "rewards/rejected": -1.9358398914337158, "step": 760 }, { "epoch": 0.38815374921235035, "grad_norm": 21.875, "learning_rate": 3.312431842966194e-07, "logits/chosen": -0.8745867609977722, "logits/rejected": -0.9834270477294922, "logps/chosen": -60.55878829956055, "logps/rejected": -84.3352279663086, "loss": 0.2894, "rewards/accuracies": 0.875, "rewards/chosen": 0.13102281093597412, "rewards/margins": 2.0422165393829346, "rewards/rejected": -1.91119384765625, "step": 770 }, { "epoch": 0.3931947069943289, "grad_norm": 25.125, "learning_rate": 3.2851690294438384e-07, "logits/chosen": -0.8937376141548157, "logits/rejected": -0.9868464469909668, "logps/chosen": -59.818359375, "logps/rejected": -85.10429382324219, "loss": 0.2596, "rewards/accuracies": 0.90625, "rewards/chosen": 0.20219922065734863, "rewards/margins": 2.1438941955566406, "rewards/rejected": -1.9416948556900024, "step": 780 }, { "epoch": 0.3982356647763075, "grad_norm": 32.25, "learning_rate": 3.2579062159214833e-07, "logits/chosen": -0.92058265209198, "logits/rejected": -1.0123975276947021, "logps/chosen": -60.107933044433594, "logps/rejected": -84.3521957397461, "loss": 0.294, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.1681869924068451, "rewards/margins": 2.0432465076446533, "rewards/rejected": -1.8750594854354858, "step": 790 }, { "epoch": 0.40327662255828606, "grad_norm": 15.0625, "learning_rate": 3.2306434023991277e-07, "logits/chosen": -0.896182656288147, "logits/rejected": -0.9941380620002747, "logps/chosen": -61.2158203125, "logps/rejected": -82.98789978027344, "loss": 0.3677, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.1341857612133026, "rewards/margins": 1.88623046875, "rewards/rejected": -1.752044677734375, "step": 800 }, { "epoch": 0.40327662255828606, "eval_logits/chosen": -0.911089301109314, "eval_logits/rejected": -0.9992658495903015, "eval_logps/chosen": -60.45991897583008, "eval_logps/rejected": -83.99901580810547, "eval_loss": 0.2951066493988037, "eval_rewards/accuracies": 0.8886227607727051, "eval_rewards/chosen": 0.1546352505683899, "eval_rewards/margins": 2.0157766342163086, "eval_rewards/rejected": -1.861141324043274, "eval_runtime": 71.0897, "eval_samples_per_second": 23.491, "eval_steps_per_second": 23.491, "step": 800 }, { "epoch": 0.40831758034026466, "grad_norm": 16.625, "learning_rate": 3.2033805888767715e-07, "logits/chosen": -0.9287660717964172, "logits/rejected": -1.0095508098602295, "logps/chosen": -60.53706741333008, "logps/rejected": -83.99447631835938, "loss": 0.2838, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1390010416507721, "rewards/margins": 1.9923112392425537, "rewards/rejected": -1.8533103466033936, "step": 810 }, { "epoch": 0.4133585381222432, "grad_norm": 33.0, "learning_rate": 3.1761177753544164e-07, "logits/chosen": -0.9320980310440063, "logits/rejected": -1.0176376104354858, "logps/chosen": -59.87189483642578, "logps/rejected": -84.4596176147461, "loss": 0.2483, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.1814599335193634, "rewards/margins": 2.1097805500030518, "rewards/rejected": -1.9283206462860107, "step": 820 }, { "epoch": 0.4183994959042218, "grad_norm": 12.9375, "learning_rate": 3.148854961832061e-07, "logits/chosen": -0.9213443994522095, "logits/rejected": -1.0197076797485352, "logps/chosen": -59.96300506591797, "logps/rejected": -84.28089141845703, "loss": 0.2707, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1989632546901703, "rewards/margins": 2.1028213500976562, "rewards/rejected": -1.9038581848144531, "step": 830 }, { "epoch": 0.42344045368620037, "grad_norm": 5.90625, "learning_rate": 3.121592148309705e-07, "logits/chosen": -0.914442241191864, "logits/rejected": -0.9979844093322754, "logps/chosen": -59.945274353027344, "logps/rejected": -83.28790283203125, "loss": 0.2793, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.17169013619422913, "rewards/margins": 2.031377077102661, "rewards/rejected": -1.8596868515014648, "step": 840 }, { "epoch": 0.428481411468179, "grad_norm": 13.4375, "learning_rate": 3.09432933478735e-07, "logits/chosen": -0.9071733355522156, "logits/rejected": -1.0141820907592773, "logps/chosen": -60.1943244934082, "logps/rejected": -84.26617431640625, "loss": 0.2751, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1853206306695938, "rewards/margins": 2.0591628551483154, "rewards/rejected": -1.8738422393798828, "step": 850 }, { "epoch": 0.4335223692501575, "grad_norm": 12.875, "learning_rate": 3.0670665212649945e-07, "logits/chosen": -0.9004698991775513, "logits/rejected": -0.9726663827896118, "logps/chosen": -60.701194763183594, "logps/rejected": -83.21146392822266, "loss": 0.3464, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12978795170783997, "rewards/margins": 1.9474796056747437, "rewards/rejected": -1.8176918029785156, "step": 860 }, { "epoch": 0.43856332703213613, "grad_norm": 13.125, "learning_rate": 3.0398037077426394e-07, "logits/chosen": -0.8763822317123413, "logits/rejected": -0.9599603414535522, "logps/chosen": -60.947471618652344, "logps/rejected": -84.69038391113281, "loss": 0.3711, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.12584984302520752, "rewards/margins": 2.0357413291931152, "rewards/rejected": -1.9098914861679077, "step": 870 }, { "epoch": 0.4436042848141147, "grad_norm": 12.0, "learning_rate": 3.0125408942202833e-07, "logits/chosen": -0.9101651906967163, "logits/rejected": -1.014981985092163, "logps/chosen": -59.863014221191406, "logps/rejected": -83.40458679199219, "loss": 0.3145, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.2130097895860672, "rewards/margins": 2.023256301879883, "rewards/rejected": -1.810246467590332, "step": 880 }, { "epoch": 0.44864524259609323, "grad_norm": 8.5, "learning_rate": 2.9852780806979277e-07, "logits/chosen": -0.8842908143997192, "logits/rejected": -1.0119690895080566, "logps/chosen": -58.0797233581543, "logps/rejected": -84.85250091552734, "loss": 0.1971, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.34588780999183655, "rewards/margins": 2.315692186355591, "rewards/rejected": -1.9698044061660767, "step": 890 }, { "epoch": 0.45368620037807184, "grad_norm": 21.125, "learning_rate": 2.9580152671755726e-07, "logits/chosen": -0.861519455909729, "logits/rejected": -0.9699891805648804, "logps/chosen": -58.99384689331055, "logps/rejected": -84.5435791015625, "loss": 0.2497, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2806755006313324, "rewards/margins": 2.2056772708892822, "rewards/rejected": -1.925001859664917, "step": 900 }, { "epoch": 0.45368620037807184, "eval_logits/chosen": -0.9106476902961731, "eval_logits/rejected": -1.0023449659347534, "eval_logps/chosen": -60.15101623535156, "eval_logps/rejected": -84.17161560058594, "eval_loss": 0.29339829087257385, "eval_rewards/accuracies": 0.886227548122406, "eval_rewards/chosen": 0.185525581240654, "eval_rewards/margins": 2.0639255046844482, "eval_rewards/rejected": -1.8783999681472778, "eval_runtime": 70.7305, "eval_samples_per_second": 23.611, "eval_steps_per_second": 23.611, "step": 900 }, { "epoch": 0.4587271581600504, "grad_norm": 21.5, "learning_rate": 2.930752453653217e-07, "logits/chosen": -0.8937481641769409, "logits/rejected": -0.9957500696182251, "logps/chosen": -59.79345703125, "logps/rejected": -83.85224914550781, "loss": 0.3282, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.231471985578537, "rewards/margins": 2.0565638542175293, "rewards/rejected": -1.8250917196273804, "step": 910 }, { "epoch": 0.463768115942029, "grad_norm": 18.75, "learning_rate": 2.9034896401308613e-07, "logits/chosen": -0.8847681283950806, "logits/rejected": -0.9873048663139343, "logps/chosen": -59.479347229003906, "logps/rejected": -86.35008239746094, "loss": 0.2292, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.2320963442325592, "rewards/margins": 2.310370683670044, "rewards/rejected": -2.0782742500305176, "step": 920 }, { "epoch": 0.46880907372400754, "grad_norm": 22.25, "learning_rate": 2.876226826608506e-07, "logits/chosen": -0.915035605430603, "logits/rejected": -1.0067721605300903, "logps/chosen": -59.74500274658203, "logps/rejected": -82.90184020996094, "loss": 0.3143, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.18378940224647522, "rewards/margins": 1.998040795326233, "rewards/rejected": -1.814251184463501, "step": 930 }, { "epoch": 0.47385003150598615, "grad_norm": 33.75, "learning_rate": 2.84896401308615e-07, "logits/chosen": -0.9178056716918945, "logits/rejected": -1.0208479166030884, "logps/chosen": -59.2294807434082, "logps/rejected": -84.78729248046875, "loss": 0.2393, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.25158706307411194, "rewards/margins": 2.1938788890838623, "rewards/rejected": -1.9422919750213623, "step": 940 }, { "epoch": 0.4788909892879647, "grad_norm": 34.75, "learning_rate": 2.8217011995637945e-07, "logits/chosen": -0.8805239796638489, "logits/rejected": -0.9753271341323853, "logps/chosen": -59.95941162109375, "logps/rejected": -84.86714172363281, "loss": 0.2666, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.19955232739448547, "rewards/margins": 2.1361844539642334, "rewards/rejected": -1.936632513999939, "step": 950 }, { "epoch": 0.4839319470699433, "grad_norm": 23.5, "learning_rate": 2.7944383860414394e-07, "logits/chosen": -0.8861673474311829, "logits/rejected": -0.9975956082344055, "logps/chosen": -59.12894821166992, "logps/rejected": -85.26177978515625, "loss": 0.2303, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2717575132846832, "rewards/margins": 2.239091396331787, "rewards/rejected": -1.9673335552215576, "step": 960 }, { "epoch": 0.48897290485192185, "grad_norm": 19.625, "learning_rate": 2.767175572519084e-07, "logits/chosen": -0.9173597097396851, "logits/rejected": -1.0085917711257935, "logps/chosen": -59.86204147338867, "logps/rejected": -84.22465515136719, "loss": 0.2877, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.20364825427532196, "rewards/margins": 2.122812509536743, "rewards/rejected": -1.9191640615463257, "step": 970 }, { "epoch": 0.49401386263390046, "grad_norm": 20.125, "learning_rate": 2.7399127589967287e-07, "logits/chosen": -0.8748540878295898, "logits/rejected": -0.9848943948745728, "logps/chosen": -58.594139099121094, "logps/rejected": -84.79752349853516, "loss": 0.2749, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.31691378355026245, "rewards/margins": 2.249189615249634, "rewards/rejected": -1.9322757720947266, "step": 980 }, { "epoch": 0.499054820415879, "grad_norm": 11.5625, "learning_rate": 2.712649945474373e-07, "logits/chosen": -0.8857355117797852, "logits/rejected": -0.9768050909042358, "logps/chosen": -59.4735107421875, "logps/rejected": -84.06812286376953, "loss": 0.2732, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.2545910179615021, "rewards/margins": 2.087873935699463, "rewards/rejected": -1.8332828283309937, "step": 990 }, { "epoch": 0.5040957781978576, "grad_norm": 35.0, "learning_rate": 2.685387131952017e-07, "logits/chosen": -0.8867548108100891, "logits/rejected": -1.0059692859649658, "logps/chosen": -59.520362854003906, "logps/rejected": -84.2868881225586, "loss": 0.282, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.25644558668136597, "rewards/margins": 2.160256862640381, "rewards/rejected": -1.9038112163543701, "step": 1000 }, { "epoch": 0.5040957781978576, "eval_logits/chosen": -0.9006710052490234, "eval_logits/rejected": -0.9953308701515198, "eval_logps/chosen": -59.89181900024414, "eval_logps/rejected": -84.33870697021484, "eval_loss": 0.29226598143577576, "eval_rewards/accuracies": 0.8874251246452332, "eval_rewards/chosen": 0.21144606173038483, "eval_rewards/margins": 2.106555700302124, "eval_rewards/rejected": -1.8951095342636108, "eval_runtime": 71.1587, "eval_samples_per_second": 23.469, "eval_steps_per_second": 23.469, "step": 1000 }, { "epoch": 0.5091367359798362, "grad_norm": 13.375, "learning_rate": 2.658124318429662e-07, "logits/chosen": -0.8491979837417603, "logits/rejected": -0.9555536508560181, "logps/chosen": -59.57502365112305, "logps/rejected": -83.54940795898438, "loss": 0.3238, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.27595460414886475, "rewards/margins": 2.066524028778076, "rewards/rejected": -1.7905696630477905, "step": 1010 }, { "epoch": 0.5141776937618148, "grad_norm": 14.375, "learning_rate": 2.630861504907306e-07, "logits/chosen": -0.871178150177002, "logits/rejected": -0.9892231225967407, "logps/chosen": -59.257232666015625, "logps/rejected": -86.75993347167969, "loss": 0.1845, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.26664021611213684, "rewards/margins": 2.3842711448669434, "rewards/rejected": -2.11763072013855, "step": 1020 }, { "epoch": 0.5192186515437933, "grad_norm": 8.1875, "learning_rate": 2.6035986913849506e-07, "logits/chosen": -0.9025937914848328, "logits/rejected": -0.9637918472290039, "logps/chosen": -59.708038330078125, "logps/rejected": -83.67890930175781, "loss": 0.3105, "rewards/accuracies": 0.875, "rewards/chosen": 0.22001886367797852, "rewards/margins": 2.053025722503662, "rewards/rejected": -1.8330070972442627, "step": 1030 }, { "epoch": 0.5242596093257719, "grad_norm": 27.625, "learning_rate": 2.5763358778625955e-07, "logits/chosen": -0.8830963373184204, "logits/rejected": -0.9901777505874634, "logps/chosen": -58.61798095703125, "logps/rejected": -84.18465423583984, "loss": 0.2793, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.33424392342567444, "rewards/margins": 2.2203192710876465, "rewards/rejected": -1.8860752582550049, "step": 1040 }, { "epoch": 0.5293005671077504, "grad_norm": 11.875, "learning_rate": 2.54907306434024e-07, "logits/chosen": -0.8822509050369263, "logits/rejected": -1.008209466934204, "logps/chosen": -60.164794921875, "logps/rejected": -84.7709732055664, "loss": 0.2963, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.22792068123817444, "rewards/margins": 2.1442337036132812, "rewards/rejected": -1.91631281375885, "step": 1050 }, { "epoch": 0.5343415248897291, "grad_norm": 7.46875, "learning_rate": 2.521810250817885e-07, "logits/chosen": -0.8839899897575378, "logits/rejected": -1.0129756927490234, "logps/chosen": -59.81916046142578, "logps/rejected": -85.43331909179688, "loss": 0.2275, "rewards/accuracies": 0.90625, "rewards/chosen": 0.21425476670265198, "rewards/margins": 2.2582240104675293, "rewards/rejected": -2.04396915435791, "step": 1060 }, { "epoch": 0.5393824826717076, "grad_norm": 15.5625, "learning_rate": 2.4945474372955287e-07, "logits/chosen": -0.82720947265625, "logits/rejected": -0.907505989074707, "logps/chosen": -59.22252655029297, "logps/rejected": -83.96528625488281, "loss": 0.2807, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.2757384181022644, "rewards/margins": 2.1067721843719482, "rewards/rejected": -1.8310333490371704, "step": 1070 }, { "epoch": 0.5444234404536862, "grad_norm": 26.75, "learning_rate": 2.467284623773173e-07, "logits/chosen": -0.8436921834945679, "logits/rejected": -0.942459225654602, "logps/chosen": -59.066001892089844, "logps/rejected": -84.55767059326172, "loss": 0.2846, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2789579927921295, "rewards/margins": 2.1750502586364746, "rewards/rejected": -1.896092176437378, "step": 1080 }, { "epoch": 0.5494643982356647, "grad_norm": 26.75, "learning_rate": 2.440021810250818e-07, "logits/chosen": -0.9082972407341003, "logits/rejected": -1.0078270435333252, "logps/chosen": -58.6378059387207, "logps/rejected": -86.2591323852539, "loss": 0.224, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.31546300649642944, "rewards/margins": 2.3998687267303467, "rewards/rejected": -2.0844056606292725, "step": 1090 }, { "epoch": 0.5545053560176434, "grad_norm": 21.75, "learning_rate": 2.4127589967284623e-07, "logits/chosen": -0.8725331425666809, "logits/rejected": -0.9712456464767456, "logps/chosen": -58.98591995239258, "logps/rejected": -84.76383972167969, "loss": 0.2557, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.29521211981773376, "rewards/margins": 2.201046943664551, "rewards/rejected": -1.9058347940444946, "step": 1100 }, { "epoch": 0.5545053560176434, "eval_logits/chosen": -0.9060633182525635, "eval_logits/rejected": -1.0051296949386597, "eval_logps/chosen": -59.636837005615234, "eval_logps/rejected": -84.44316864013672, "eval_loss": 0.2921236455440521, "eval_rewards/accuracies": 0.8874251246452332, "eval_rewards/chosen": 0.23694448173046112, "eval_rewards/margins": 2.142500638961792, "eval_rewards/rejected": -1.9055562019348145, "eval_runtime": 72.2931, "eval_samples_per_second": 23.1, "eval_steps_per_second": 23.1, "step": 1100 }, { "epoch": 0.5595463137996219, "grad_norm": 19.25, "learning_rate": 2.3854961832061067e-07, "logits/chosen": -0.8947960734367371, "logits/rejected": -1.0010361671447754, "logps/chosen": -58.5375862121582, "logps/rejected": -84.85458374023438, "loss": 0.2887, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.3221047520637512, "rewards/margins": 2.255513906478882, "rewards/rejected": -1.9334090948104858, "step": 1110 }, { "epoch": 0.5645872715816005, "grad_norm": 12.5625, "learning_rate": 2.358233369683751e-07, "logits/chosen": -0.8644863367080688, "logits/rejected": -0.971636950969696, "logps/chosen": -58.843421936035156, "logps/rejected": -84.93423461914062, "loss": 0.2627, "rewards/accuracies": 0.875, "rewards/chosen": 0.3323742747306824, "rewards/margins": 2.243507146835327, "rewards/rejected": -1.9111328125, "step": 1120 }, { "epoch": 0.569628229363579, "grad_norm": 16.25, "learning_rate": 2.3309705561613957e-07, "logits/chosen": -0.8721977472305298, "logits/rejected": -0.9687450528144836, "logps/chosen": -59.44976806640625, "logps/rejected": -82.7881851196289, "loss": 0.3487, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.25436902046203613, "rewards/margins": 2.020705461502075, "rewards/rejected": -1.766336441040039, "step": 1130 }, { "epoch": 0.5746691871455577, "grad_norm": 22.25, "learning_rate": 2.3037077426390404e-07, "logits/chosen": -0.889614462852478, "logits/rejected": -1.0026637315750122, "logps/chosen": -58.558433532714844, "logps/rejected": -84.0979232788086, "loss": 0.2587, "rewards/accuracies": 0.875, "rewards/chosen": 0.33784663677215576, "rewards/margins": 2.1925177574157715, "rewards/rejected": -1.8546711206436157, "step": 1140 }, { "epoch": 0.5797101449275363, "grad_norm": 27.5, "learning_rate": 2.2764449291166848e-07, "logits/chosen": -0.9427415132522583, "logits/rejected": -1.0442800521850586, "logps/chosen": -58.565757751464844, "logps/rejected": -84.64384460449219, "loss": 0.2617, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3408879339694977, "rewards/margins": 2.2868194580078125, "rewards/rejected": -1.9459314346313477, "step": 1150 }, { "epoch": 0.5847511027095148, "grad_norm": 20.625, "learning_rate": 2.2491821155943292e-07, "logits/chosen": -0.8788965344429016, "logits/rejected": -0.9990353584289551, "logps/chosen": -58.735572814941406, "logps/rejected": -84.48644256591797, "loss": 0.2425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3037385046482086, "rewards/margins": 2.239537477493286, "rewards/rejected": -1.9357990026474, "step": 1160 }, { "epoch": 0.5897920604914934, "grad_norm": 20.0, "learning_rate": 2.2219193020719738e-07, "logits/chosen": -0.874742865562439, "logits/rejected": -0.980475902557373, "logps/chosen": -58.799041748046875, "logps/rejected": -85.21891784667969, "loss": 0.2426, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3289235234260559, "rewards/margins": 2.2946279048919678, "rewards/rejected": -1.965704321861267, "step": 1170 }, { "epoch": 0.594833018273472, "grad_norm": 29.5, "learning_rate": 2.1946564885496182e-07, "logits/chosen": -0.875636875629425, "logits/rejected": -0.9794307947158813, "logps/chosen": -59.88359451293945, "logps/rejected": -84.40530395507812, "loss": 0.2653, "rewards/accuracies": 0.875, "rewards/chosen": 0.22515201568603516, "rewards/margins": 2.1366682052612305, "rewards/rejected": -1.9115161895751953, "step": 1180 }, { "epoch": 0.5998739760554506, "grad_norm": 9.875, "learning_rate": 2.1673936750272628e-07, "logits/chosen": -0.8744710683822632, "logits/rejected": -0.9897273182868958, "logps/chosen": -57.4598388671875, "logps/rejected": -86.65351867675781, "loss": 0.1861, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.42013850808143616, "rewards/margins": 2.5188748836517334, "rewards/rejected": -2.098736524581909, "step": 1190 }, { "epoch": 0.6049149338374291, "grad_norm": 11.1875, "learning_rate": 2.1401308615049072e-07, "logits/chosen": -0.8216146230697632, "logits/rejected": -0.9422334432601929, "logps/chosen": -58.2786979675293, "logps/rejected": -86.90044403076172, "loss": 0.2244, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3587431311607361, "rewards/margins": 2.450103282928467, "rewards/rejected": -2.091360569000244, "step": 1200 }, { "epoch": 0.6049149338374291, "eval_logits/chosen": -0.8834900856018066, "eval_logits/rejected": -0.9822787642478943, "eval_logps/chosen": -59.49919891357422, "eval_logps/rejected": -84.61227416992188, "eval_loss": 0.2913074493408203, "eval_rewards/accuracies": 0.8892215490341187, "eval_rewards/chosen": 0.2507072687149048, "eval_rewards/margins": 2.1731746196746826, "eval_rewards/rejected": -1.9224671125411987, "eval_runtime": 72.5811, "eval_samples_per_second": 23.009, "eval_steps_per_second": 23.009, "step": 1200 }, { "epoch": 0.6099558916194077, "grad_norm": 7.84375, "learning_rate": 2.1128680479825516e-07, "logits/chosen": -0.8642328977584839, "logits/rejected": -0.9527546167373657, "logps/chosen": -59.0882682800293, "logps/rejected": -84.75911712646484, "loss": 0.2518, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.28083863854408264, "rewards/margins": 2.2171006202697754, "rewards/rejected": -1.936261773109436, "step": 1210 }, { "epoch": 0.6149968494013862, "grad_norm": 7.21875, "learning_rate": 2.0856052344601963e-07, "logits/chosen": -0.8976170420646667, "logits/rejected": -0.9783880114555359, "logps/chosen": -60.150733947753906, "logps/rejected": -84.77664947509766, "loss": 0.3227, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.18833044171333313, "rewards/margins": 2.1451687812805176, "rewards/rejected": -1.9568383693695068, "step": 1220 }, { "epoch": 0.6200378071833649, "grad_norm": 16.875, "learning_rate": 2.058342420937841e-07, "logits/chosen": -0.8992708325386047, "logits/rejected": -0.9903928637504578, "logps/chosen": -58.91829299926758, "logps/rejected": -84.00004577636719, "loss": 0.3041, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.2912471294403076, "rewards/margins": 2.173732280731201, "rewards/rejected": -1.8824853897094727, "step": 1230 }, { "epoch": 0.6250787649653434, "grad_norm": 17.125, "learning_rate": 2.031079607415485e-07, "logits/chosen": -0.9080449938774109, "logits/rejected": -1.0125758647918701, "logps/chosen": -59.39638137817383, "logps/rejected": -84.40299224853516, "loss": 0.322, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.2707884907722473, "rewards/margins": 2.1304423809051514, "rewards/rejected": -1.8596540689468384, "step": 1240 }, { "epoch": 0.630119722747322, "grad_norm": 6.84375, "learning_rate": 2.0038167938931297e-07, "logits/chosen": -0.8706466555595398, "logits/rejected": -0.9584230184555054, "logps/chosen": -58.84540939331055, "logps/rejected": -83.47660064697266, "loss": 0.3036, "rewards/accuracies": 0.875, "rewards/chosen": 0.30719193816185, "rewards/margins": 2.1285505294799805, "rewards/rejected": -1.8213586807250977, "step": 1250 }, { "epoch": 0.6351606805293005, "grad_norm": 25.75, "learning_rate": 1.9765539803707743e-07, "logits/chosen": -0.8904057741165161, "logits/rejected": -0.9874610900878906, "logps/chosen": -59.1779899597168, "logps/rejected": -84.42768859863281, "loss": 0.3073, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.2561061382293701, "rewards/margins": 2.1732170581817627, "rewards/rejected": -1.917110800743103, "step": 1260 }, { "epoch": 0.6402016383112792, "grad_norm": 16.75, "learning_rate": 1.9492911668484184e-07, "logits/chosen": -0.8631388545036316, "logits/rejected": -0.9788234829902649, "logps/chosen": -58.17034912109375, "logps/rejected": -86.0931167602539, "loss": 0.2073, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3628115653991699, "rewards/margins": 2.3852200508117676, "rewards/rejected": -2.0224084854125977, "step": 1270 }, { "epoch": 0.6452425960932577, "grad_norm": 7.78125, "learning_rate": 1.922028353326063e-07, "logits/chosen": -0.886549174785614, "logits/rejected": -1.004765510559082, "logps/chosen": -58.592750549316406, "logps/rejected": -84.68618774414062, "loss": 0.2295, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.322457492351532, "rewards/margins": 2.2541778087615967, "rewards/rejected": -1.9317200183868408, "step": 1280 }, { "epoch": 0.6502835538752363, "grad_norm": 13.6875, "learning_rate": 1.8947655398037077e-07, "logits/chosen": -0.8659318685531616, "logits/rejected": -0.9652425646781921, "logps/chosen": -60.613441467285156, "logps/rejected": -84.11669158935547, "loss": 0.3235, "rewards/accuracies": 0.875, "rewards/chosen": 0.18255123496055603, "rewards/margins": 2.038862705230713, "rewards/rejected": -1.8563114404678345, "step": 1290 }, { "epoch": 0.6553245116572148, "grad_norm": 23.375, "learning_rate": 1.8675027262813524e-07, "logits/chosen": -0.8807634115219116, "logits/rejected": -1.0138846635818481, "logps/chosen": -58.674583435058594, "logps/rejected": -85.5340576171875, "loss": 0.2442, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3219856321811676, "rewards/margins": 2.348407030105591, "rewards/rejected": -2.026421308517456, "step": 1300 }, { "epoch": 0.6553245116572148, "eval_logits/chosen": -0.9014637470245361, "eval_logits/rejected": -1.0028033256530762, "eval_logps/chosen": -59.48044204711914, "eval_logps/rejected": -84.73458099365234, "eval_loss": 0.29055994749069214, "eval_rewards/accuracies": 0.8856287598609924, "eval_rewards/chosen": 0.25258320569992065, "eval_rewards/margins": 2.1872806549072266, "eval_rewards/rejected": -1.9346975088119507, "eval_runtime": 70.3028, "eval_samples_per_second": 23.754, "eval_steps_per_second": 23.754, "step": 1300 }, { "epoch": 0.6603654694391935, "grad_norm": 10.0625, "learning_rate": 1.8402399127589965e-07, "logits/chosen": -0.8660598993301392, "logits/rejected": -0.965117335319519, "logps/chosen": -58.19904708862305, "logps/rejected": -85.5027084350586, "loss": 0.2252, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.3830675482749939, "rewards/margins": 2.3513522148132324, "rewards/rejected": -1.9682846069335938, "step": 1310 }, { "epoch": 0.665406427221172, "grad_norm": 19.75, "learning_rate": 1.8129770992366411e-07, "logits/chosen": -0.8987786173820496, "logits/rejected": -0.9898836016654968, "logps/chosen": -58.44793701171875, "logps/rejected": -85.73442077636719, "loss": 0.2653, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3300415873527527, "rewards/margins": 2.320068836212158, "rewards/rejected": -1.9900271892547607, "step": 1320 }, { "epoch": 0.6704473850031506, "grad_norm": 26.625, "learning_rate": 1.7857142857142858e-07, "logits/chosen": -0.8488761186599731, "logits/rejected": -0.9688565135002136, "logps/chosen": -58.56892776489258, "logps/rejected": -84.93611907958984, "loss": 0.2507, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.33988556265830994, "rewards/margins": 2.270322799682617, "rewards/rejected": -1.9304373264312744, "step": 1330 }, { "epoch": 0.6754883427851291, "grad_norm": 14.25, "learning_rate": 1.7584514721919302e-07, "logits/chosen": -0.8936493992805481, "logits/rejected": -0.9925310015678406, "logps/chosen": -57.687774658203125, "logps/rejected": -86.47679138183594, "loss": 0.1893, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.372401624917984, "rewards/margins": 2.4566822052001953, "rewards/rejected": -2.084280490875244, "step": 1340 }, { "epoch": 0.6805293005671077, "grad_norm": 12.6875, "learning_rate": 1.7311886586695746e-07, "logits/chosen": -0.8652948141098022, "logits/rejected": -0.9912670254707336, "logps/chosen": -59.039756774902344, "logps/rejected": -84.45045471191406, "loss": 0.2659, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2832827866077423, "rewards/margins": 2.2053096294403076, "rewards/rejected": -1.9220268726348877, "step": 1350 }, { "epoch": 0.6855702583490864, "grad_norm": 8.9375, "learning_rate": 1.7039258451472192e-07, "logits/chosen": -0.830104649066925, "logits/rejected": -0.9250701069831848, "logps/chosen": -59.46904754638672, "logps/rejected": -84.41246032714844, "loss": 0.292, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.28448396921157837, "rewards/margins": 2.1490626335144043, "rewards/rejected": -1.8645786046981812, "step": 1360 }, { "epoch": 0.6906112161310649, "grad_norm": 16.25, "learning_rate": 1.6766630316248636e-07, "logits/chosen": -0.9015452265739441, "logits/rejected": -0.9891805648803711, "logps/chosen": -59.392555236816406, "logps/rejected": -84.06771087646484, "loss": 0.31, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.22625617682933807, "rewards/margins": 2.116712808609009, "rewards/rejected": -1.8904565572738647, "step": 1370 }, { "epoch": 0.6956521739130435, "grad_norm": 27.25, "learning_rate": 1.6494002181025082e-07, "logits/chosen": -0.8577421307563782, "logits/rejected": -0.9831596612930298, "logps/chosen": -59.06340408325195, "logps/rejected": -84.84831237792969, "loss": 0.2915, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.3013680577278137, "rewards/margins": 2.284956455230713, "rewards/rejected": -1.983588457107544, "step": 1380 }, { "epoch": 0.700693131695022, "grad_norm": 28.375, "learning_rate": 1.6221374045801526e-07, "logits/chosen": -0.8520832061767578, "logits/rejected": -0.9697486162185669, "logps/chosen": -59.848876953125, "logps/rejected": -85.40953063964844, "loss": 0.3043, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.2575295567512512, "rewards/margins": 2.2461721897125244, "rewards/rejected": -1.9886424541473389, "step": 1390 }, { "epoch": 0.7057340894770007, "grad_norm": 14.1875, "learning_rate": 1.594874591057797e-07, "logits/chosen": -0.8636363744735718, "logits/rejected": -0.9646850824356079, "logps/chosen": -59.90193557739258, "logps/rejected": -84.7515869140625, "loss": 0.2823, "rewards/accuracies": 0.90625, "rewards/chosen": 0.22327394783496857, "rewards/margins": 2.1452901363372803, "rewards/rejected": -1.9220161437988281, "step": 1400 }, { "epoch": 0.7057340894770007, "eval_logits/chosen": -0.8801774382591248, "eval_logits/rejected": -0.9814472794532776, "eval_logps/chosen": -59.3859748840332, "eval_logps/rejected": -84.68141174316406, "eval_loss": 0.29091569781303406, "eval_rewards/accuracies": 0.8892215490341187, "eval_rewards/chosen": 0.2620302140712738, "eval_rewards/margins": 2.191411256790161, "eval_rewards/rejected": -1.9293811321258545, "eval_runtime": 70.8997, "eval_samples_per_second": 23.554, "eval_steps_per_second": 23.554, "step": 1400 }, { "epoch": 0.7107750472589792, "grad_norm": 15.9375, "learning_rate": 1.5676117775354416e-07, "logits/chosen": -0.9009099006652832, "logits/rejected": -0.9855210185050964, "logps/chosen": -59.35606002807617, "logps/rejected": -83.26178741455078, "loss": 0.291, "rewards/accuracies": 0.875, "rewards/chosen": 0.2705304026603699, "rewards/margins": 2.0694515705108643, "rewards/rejected": -1.7989212274551392, "step": 1410 }, { "epoch": 0.7158160050409578, "grad_norm": 16.375, "learning_rate": 1.540348964013086e-07, "logits/chosen": -0.8992331624031067, "logits/rejected": -0.9897601008415222, "logps/chosen": -58.76881790161133, "logps/rejected": -85.6570816040039, "loss": 0.2559, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3151175379753113, "rewards/margins": 2.322927951812744, "rewards/rejected": -2.007810354232788, "step": 1420 }, { "epoch": 0.7208569628229363, "grad_norm": 20.5, "learning_rate": 1.5130861504907304e-07, "logits/chosen": -0.8569045066833496, "logits/rejected": -0.9533321261405945, "logps/chosen": -58.429908752441406, "logps/rejected": -84.6368179321289, "loss": 0.2602, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.35319021344184875, "rewards/margins": 2.2607696056365967, "rewards/rejected": -1.9075794219970703, "step": 1430 }, { "epoch": 0.725897920604915, "grad_norm": 18.5, "learning_rate": 1.485823336968375e-07, "logits/chosen": -0.8877646327018738, "logits/rejected": -0.9839455485343933, "logps/chosen": -59.098785400390625, "logps/rejected": -84.78980255126953, "loss": 0.2648, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.26629897952079773, "rewards/margins": 2.2212090492248535, "rewards/rejected": -1.9549100399017334, "step": 1440 }, { "epoch": 0.7309388783868935, "grad_norm": 21.75, "learning_rate": 1.4585605234460197e-07, "logits/chosen": -0.8866798281669617, "logits/rejected": -0.9940204620361328, "logps/chosen": -58.56513214111328, "logps/rejected": -85.08773040771484, "loss": 0.2643, "rewards/accuracies": 0.90625, "rewards/chosen": 0.32275161147117615, "rewards/margins": 2.3079631328582764, "rewards/rejected": -1.9852116107940674, "step": 1450 }, { "epoch": 0.7359798361688721, "grad_norm": 39.25, "learning_rate": 1.4312977099236638e-07, "logits/chosen": -0.9192501306533813, "logits/rejected": -1.0168657302856445, "logps/chosen": -59.260459899902344, "logps/rejected": -83.24293518066406, "loss": 0.3559, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.2648276388645172, "rewards/margins": 2.0748324394226074, "rewards/rejected": -1.8100048303604126, "step": 1460 }, { "epoch": 0.7410207939508506, "grad_norm": 15.6875, "learning_rate": 1.4040348964013085e-07, "logits/chosen": -0.8752067685127258, "logits/rejected": -0.991533100605011, "logps/chosen": -58.654205322265625, "logps/rejected": -85.35607147216797, "loss": 0.3037, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.31618744134902954, "rewards/margins": 2.2922956943511963, "rewards/rejected": -1.976108193397522, "step": 1470 }, { "epoch": 0.7460617517328293, "grad_norm": 16.75, "learning_rate": 1.376772082878953e-07, "logits/chosen": -0.8595184087753296, "logits/rejected": -0.9737972021102905, "logps/chosen": -58.14202880859375, "logps/rejected": -84.88631439208984, "loss": 0.2482, "rewards/accuracies": 0.90625, "rewards/chosen": 0.375678151845932, "rewards/margins": 2.3022727966308594, "rewards/rejected": -1.9265944957733154, "step": 1480 }, { "epoch": 0.7511027095148078, "grad_norm": 22.5, "learning_rate": 1.3495092693565978e-07, "logits/chosen": -0.8643430471420288, "logits/rejected": -0.9914228320121765, "logps/chosen": -57.81464767456055, "logps/rejected": -85.24693298339844, "loss": 0.2338, "rewards/accuracies": 0.90625, "rewards/chosen": 0.395480215549469, "rewards/margins": 2.3924388885498047, "rewards/rejected": -1.9969587326049805, "step": 1490 }, { "epoch": 0.7561436672967864, "grad_norm": 17.5, "learning_rate": 1.322246455834242e-07, "logits/chosen": -0.8798907995223999, "logits/rejected": -0.9934282302856445, "logps/chosen": -58.668060302734375, "logps/rejected": -85.68354034423828, "loss": 0.2743, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3188169598579407, "rewards/margins": 2.338467836380005, "rewards/rejected": -2.019650936126709, "step": 1500 }, { "epoch": 0.7561436672967864, "eval_logits/chosen": -0.8938608169555664, "eval_logits/rejected": -0.9942108988761902, "eval_logps/chosen": -59.54497528076172, "eval_logps/rejected": -84.85215759277344, "eval_loss": 0.29076066613197327, "eval_rewards/accuracies": 0.8868263363838196, "eval_rewards/chosen": 0.24613051116466522, "eval_rewards/margins": 2.1925852298736572, "eval_rewards/rejected": -1.946454644203186, "eval_runtime": 72.3546, "eval_samples_per_second": 23.081, "eval_steps_per_second": 23.081, "step": 1500 }, { "epoch": 0.7611846250787649, "grad_norm": 18.875, "learning_rate": 1.2949836423118865e-07, "logits/chosen": -0.8818739652633667, "logits/rejected": -1.0159088373184204, "logps/chosen": -57.401039123535156, "logps/rejected": -87.64556121826172, "loss": 0.1741, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4094136357307434, "rewards/margins": 2.643099784851074, "rewards/rejected": -2.2336859703063965, "step": 1510 }, { "epoch": 0.7662255828607435, "grad_norm": 20.625, "learning_rate": 1.2677208287895312e-07, "logits/chosen": -0.8670439720153809, "logits/rejected": -0.9854429364204407, "logps/chosen": -58.494041442871094, "logps/rejected": -85.44778442382812, "loss": 0.2207, "rewards/accuracies": 0.90625, "rewards/chosen": 0.35261982679367065, "rewards/margins": 2.3209452629089355, "rewards/rejected": -1.9683253765106201, "step": 1520 }, { "epoch": 0.7712665406427222, "grad_norm": 21.25, "learning_rate": 1.2404580152671756e-07, "logits/chosen": -0.8820363879203796, "logits/rejected": -1.001293420791626, "logps/chosen": -59.6056022644043, "logps/rejected": -85.07635498046875, "loss": 0.3197, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.25649771094322205, "rewards/margins": 2.231887102127075, "rewards/rejected": -1.9753892421722412, "step": 1530 }, { "epoch": 0.7763074984247007, "grad_norm": 25.25, "learning_rate": 1.21319520174482e-07, "logits/chosen": -0.9083970785140991, "logits/rejected": -1.029203176498413, "logps/chosen": -58.7678108215332, "logps/rejected": -84.92195129394531, "loss": 0.2812, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2764410376548767, "rewards/margins": 2.2596542835235596, "rewards/rejected": -1.983213186264038, "step": 1540 }, { "epoch": 0.7813484562066793, "grad_norm": 35.5, "learning_rate": 1.1859323882224645e-07, "logits/chosen": -0.848700225353241, "logits/rejected": -0.9551340341567993, "logps/chosen": -59.077735900878906, "logps/rejected": -86.9412612915039, "loss": 0.2465, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3101661801338196, "rewards/margins": 2.430529832839966, "rewards/rejected": -2.120363712310791, "step": 1550 }, { "epoch": 0.7863894139886578, "grad_norm": 26.75, "learning_rate": 1.1586695747001091e-07, "logits/chosen": -0.8691787719726562, "logits/rejected": -0.977543830871582, "logps/chosen": -59.203521728515625, "logps/rejected": -84.72015380859375, "loss": 0.2758, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.28436484932899475, "rewards/margins": 2.2119786739349365, "rewards/rejected": -1.9276138544082642, "step": 1560 }, { "epoch": 0.7914303717706365, "grad_norm": 14.0625, "learning_rate": 1.1314067611777535e-07, "logits/chosen": -0.8494786024093628, "logits/rejected": -0.9657021760940552, "logps/chosen": -58.790069580078125, "logps/rejected": -85.69227600097656, "loss": 0.2438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.30834752321243286, "rewards/margins": 2.33400297164917, "rewards/rejected": -2.0256552696228027, "step": 1570 }, { "epoch": 0.796471329552615, "grad_norm": 9.9375, "learning_rate": 1.1041439476553979e-07, "logits/chosen": -0.898410975933075, "logits/rejected": -0.9827295541763306, "logps/chosen": -59.0511360168457, "logps/rejected": -84.7381591796875, "loss": 0.3078, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.2760066092014313, "rewards/margins": 2.209929943084717, "rewards/rejected": -1.9339231252670288, "step": 1580 }, { "epoch": 0.8015122873345936, "grad_norm": 11.375, "learning_rate": 1.0768811341330425e-07, "logits/chosen": -0.8804594874382019, "logits/rejected": -1.0089373588562012, "logps/chosen": -57.877601623535156, "logps/rejected": -86.3949966430664, "loss": 0.2237, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.39419177174568176, "rewards/margins": 2.4698028564453125, "rewards/rejected": -2.075611114501953, "step": 1590 }, { "epoch": 0.8065532451165721, "grad_norm": 13.5625, "learning_rate": 1.0496183206106869e-07, "logits/chosen": -0.885684609413147, "logits/rejected": -1.007638931274414, "logps/chosen": -57.24601364135742, "logps/rejected": -85.00395202636719, "loss": 0.2164, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.43452557921409607, "rewards/margins": 2.38159441947937, "rewards/rejected": -1.9470688104629517, "step": 1600 }, { "epoch": 0.8065532451165721, "eval_logits/chosen": -0.8853088617324829, "eval_logits/rejected": -0.9853202700614929, "eval_logps/chosen": -59.518455505371094, "eval_logps/rejected": -84.9808349609375, "eval_loss": 0.2906578481197357, "eval_rewards/accuracies": 0.886227548122406, "eval_rewards/chosen": 0.24878181517124176, "eval_rewards/margins": 2.2081050872802734, "eval_rewards/rejected": -1.959323525428772, "eval_runtime": 72.4075, "eval_samples_per_second": 23.064, "eval_steps_per_second": 23.064, "step": 1600 }, { "epoch": 0.8115942028985508, "grad_norm": 22.0, "learning_rate": 1.0223555070883315e-07, "logits/chosen": -0.8960712552070618, "logits/rejected": -1.01849365234375, "logps/chosen": -59.022682189941406, "logps/rejected": -85.11215209960938, "loss": 0.234, "rewards/accuracies": 0.90625, "rewards/chosen": 0.30705201625823975, "rewards/margins": 2.2740187644958496, "rewards/rejected": -1.966966986656189, "step": 1610 }, { "epoch": 0.8166351606805293, "grad_norm": 15.3125, "learning_rate": 9.950926935659759e-08, "logits/chosen": -0.8351577520370483, "logits/rejected": -0.950478196144104, "logps/chosen": -61.142723083496094, "logps/rejected": -83.6924819946289, "loss": 0.3591, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1616726666688919, "rewards/margins": 1.9606307744979858, "rewards/rejected": -1.7989578247070312, "step": 1620 }, { "epoch": 0.8216761184625079, "grad_norm": 20.125, "learning_rate": 9.678298800436204e-08, "logits/chosen": -0.8941848874092102, "logits/rejected": -0.9981807470321655, "logps/chosen": -59.347694396972656, "logps/rejected": -83.65950012207031, "loss": 0.3604, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.27368858456611633, "rewards/margins": 2.1373450756073, "rewards/rejected": -1.8636566400527954, "step": 1630 }, { "epoch": 0.8267170762444864, "grad_norm": 22.0, "learning_rate": 9.40567066521265e-08, "logits/chosen": -0.8934575915336609, "logits/rejected": -1.0223013162612915, "logps/chosen": -58.4257698059082, "logps/rejected": -85.53926086425781, "loss": 0.2539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.34174391627311707, "rewards/margins": 2.34285569190979, "rewards/rejected": -2.0011115074157715, "step": 1640 }, { "epoch": 0.831758034026465, "grad_norm": 36.0, "learning_rate": 9.133042529989095e-08, "logits/chosen": -0.834098219871521, "logits/rejected": -0.95869380235672, "logps/chosen": -60.06525802612305, "logps/rejected": -84.23382568359375, "loss": 0.3107, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.22542810440063477, "rewards/margins": 2.11167573928833, "rewards/rejected": -1.8862476348876953, "step": 1650 }, { "epoch": 0.8367989918084436, "grad_norm": 13.375, "learning_rate": 8.860414394765539e-08, "logits/chosen": -0.8735988736152649, "logits/rejected": -0.9876992106437683, "logps/chosen": -58.84014892578125, "logps/rejected": -85.5834732055664, "loss": 0.2594, "rewards/accuracies": 0.90625, "rewards/chosen": 0.31726759672164917, "rewards/margins": 2.3258700370788574, "rewards/rejected": -2.0086026191711426, "step": 1660 }, { "epoch": 0.8418399495904222, "grad_norm": 32.25, "learning_rate": 8.587786259541985e-08, "logits/chosen": -0.845401406288147, "logits/rejected": -0.9550067782402039, "logps/chosen": -58.90953826904297, "logps/rejected": -85.50616455078125, "loss": 0.293, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.3285283148288727, "rewards/margins": 2.3436825275421143, "rewards/rejected": -2.0151543617248535, "step": 1670 }, { "epoch": 0.8468809073724007, "grad_norm": 29.625, "learning_rate": 8.315158124318429e-08, "logits/chosen": -0.846836268901825, "logits/rejected": -0.9689435958862305, "logps/chosen": -59.2591552734375, "logps/rejected": -84.11135864257812, "loss": 0.3235, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.30486565828323364, "rewards/margins": 2.1752853393554688, "rewards/rejected": -1.8704197406768799, "step": 1680 }, { "epoch": 0.8519218651543793, "grad_norm": 18.25, "learning_rate": 8.042529989094875e-08, "logits/chosen": -0.8394562005996704, "logits/rejected": -0.9826558828353882, "logps/chosen": -58.2289924621582, "logps/rejected": -85.12105560302734, "loss": 0.2287, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.386637419462204, "rewards/margins": 2.350778341293335, "rewards/rejected": -1.9641408920288086, "step": 1690 }, { "epoch": 0.856962822936358, "grad_norm": 14.1875, "learning_rate": 7.769901853871319e-08, "logits/chosen": -0.8795615434646606, "logits/rejected": -0.9935046434402466, "logps/chosen": -56.979454040527344, "logps/rejected": -86.98414611816406, "loss": 0.1638, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.47562724351882935, "rewards/margins": 2.5870258808135986, "rewards/rejected": -2.111398220062256, "step": 1700 }, { "epoch": 0.856962822936358, "eval_logits/chosen": -0.8943283557891846, "eval_logits/rejected": -0.9974154829978943, "eval_logps/chosen": -59.26322937011719, "eval_logps/rejected": -84.75007629394531, "eval_loss": 0.29021164774894714, "eval_rewards/accuracies": 0.8892215490341187, "eval_rewards/chosen": 0.2743041515350342, "eval_rewards/margins": 2.2105515003204346, "eval_rewards/rejected": -1.9362471103668213, "eval_runtime": 70.6147, "eval_samples_per_second": 23.649, "eval_steps_per_second": 23.649, "step": 1700 }, { "epoch": 0.8620037807183365, "grad_norm": 31.125, "learning_rate": 7.497273718647764e-08, "logits/chosen": -0.9100006818771362, "logits/rejected": -1.028938889503479, "logps/chosen": -58.05731964111328, "logps/rejected": -84.74702453613281, "loss": 0.2487, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3694465756416321, "rewards/margins": 2.2923641204833984, "rewards/rejected": -1.9229179620742798, "step": 1710 }, { "epoch": 0.867044738500315, "grad_norm": 16.625, "learning_rate": 7.22464558342421e-08, "logits/chosen": -0.8805274963378906, "logits/rejected": -0.9727686047554016, "logps/chosen": -58.530845642089844, "logps/rejected": -84.3791275024414, "loss": 0.2809, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3172784447669983, "rewards/margins": 2.2292685508728027, "rewards/rejected": -1.9119901657104492, "step": 1720 }, { "epoch": 0.8720856962822936, "grad_norm": 24.125, "learning_rate": 6.952017448200655e-08, "logits/chosen": -0.903597354888916, "logits/rejected": -1.0219088792800903, "logps/chosen": -58.6904411315918, "logps/rejected": -84.50187683105469, "loss": 0.252, "rewards/accuracies": 0.875, "rewards/chosen": 0.3121050298213959, "rewards/margins": 2.2391486167907715, "rewards/rejected": -1.9270436763763428, "step": 1730 }, { "epoch": 0.8771266540642723, "grad_norm": 8.8125, "learning_rate": 6.679389312977098e-08, "logits/chosen": -0.8927151560783386, "logits/rejected": -0.9953739047050476, "logps/chosen": -59.15825271606445, "logps/rejected": -85.2041244506836, "loss": 0.2347, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29449015855789185, "rewards/margins": 2.263671398162842, "rewards/rejected": -1.9691814184188843, "step": 1740 }, { "epoch": 0.8821676118462508, "grad_norm": 8.25, "learning_rate": 6.406761177753544e-08, "logits/chosen": -0.857822597026825, "logits/rejected": -0.9733842015266418, "logps/chosen": -58.579246520996094, "logps/rejected": -85.5455551147461, "loss": 0.2629, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.3299351930618286, "rewards/margins": 2.326323986053467, "rewards/rejected": -1.9963890314102173, "step": 1750 }, { "epoch": 0.8872085696282294, "grad_norm": 12.5, "learning_rate": 6.134133042529989e-08, "logits/chosen": -0.9069849848747253, "logits/rejected": -1.0028917789459229, "logps/chosen": -58.58331298828125, "logps/rejected": -86.31389617919922, "loss": 0.2365, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3112293779850006, "rewards/margins": 2.403074026107788, "rewards/rejected": -2.0918445587158203, "step": 1760 }, { "epoch": 0.8922495274102079, "grad_norm": 26.375, "learning_rate": 5.861504907306434e-08, "logits/chosen": -0.8625677227973938, "logits/rejected": -0.9746575355529785, "logps/chosen": -57.2651252746582, "logps/rejected": -85.93891906738281, "loss": 0.2094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.417421817779541, "rewards/margins": 2.4767680168151855, "rewards/rejected": -2.0593464374542236, "step": 1770 }, { "epoch": 0.8972904851921865, "grad_norm": 29.875, "learning_rate": 5.588876772082879e-08, "logits/chosen": -0.8995459675788879, "logits/rejected": -1.0156245231628418, "logps/chosen": -58.09168243408203, "logps/rejected": -85.68585968017578, "loss": 0.2295, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.3603942096233368, "rewards/margins": 2.3807685375213623, "rewards/rejected": -2.020374298095703, "step": 1780 }, { "epoch": 0.9023314429741651, "grad_norm": 11.75, "learning_rate": 5.316248636859324e-08, "logits/chosen": -0.8375666737556458, "logits/rejected": -0.9661946296691895, "logps/chosen": -59.528541564941406, "logps/rejected": -84.20811462402344, "loss": 0.3174, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2562711834907532, "rewards/margins": 2.144699811935425, "rewards/rejected": -1.8884284496307373, "step": 1790 }, { "epoch": 0.9073724007561437, "grad_norm": 26.25, "learning_rate": 5.043620501635769e-08, "logits/chosen": -0.8782273530960083, "logits/rejected": -0.9897629618644714, "logps/chosen": -58.66547393798828, "logps/rejected": -84.6033706665039, "loss": 0.2588, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3361870348453522, "rewards/margins": 2.2399086952209473, "rewards/rejected": -1.903721570968628, "step": 1800 }, { "epoch": 0.9073724007561437, "eval_logits/chosen": -0.891097903251648, "eval_logits/rejected": -0.9927994608879089, "eval_logps/chosen": -59.322288513183594, "eval_logps/rejected": -84.85601806640625, "eval_loss": 0.2902388572692871, "eval_rewards/accuracies": 0.8886227607727051, "eval_rewards/chosen": 0.2683981657028198, "eval_rewards/margins": 2.215238094329834, "eval_rewards/rejected": -1.9468399286270142, "eval_runtime": 73.6165, "eval_samples_per_second": 22.685, "eval_steps_per_second": 22.685, "step": 1800 }, { "epoch": 0.9124133585381222, "grad_norm": 23.125, "learning_rate": 4.770992366412214e-08, "logits/chosen": -0.9097870588302612, "logits/rejected": -1.0100575685501099, "logps/chosen": -58.4896354675293, "logps/rejected": -85.04032897949219, "loss": 0.2725, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3351896405220032, "rewards/margins": 2.258465528488159, "rewards/rejected": -1.9232757091522217, "step": 1810 }, { "epoch": 0.9174543163201008, "grad_norm": 10.0625, "learning_rate": 4.498364231188658e-08, "logits/chosen": -0.9119001626968384, "logits/rejected": -0.9829280972480774, "logps/chosen": -59.015846252441406, "logps/rejected": -85.16139221191406, "loss": 0.2527, "rewards/accuracies": 0.90625, "rewards/chosen": 0.29453665018081665, "rewards/margins": 2.282649517059326, "rewards/rejected": -1.9881130456924438, "step": 1820 }, { "epoch": 0.9224952741020794, "grad_norm": 37.0, "learning_rate": 4.225736095965103e-08, "logits/chosen": -0.8899482488632202, "logits/rejected": -1.003125786781311, "logps/chosen": -58.26552200317383, "logps/rejected": -84.43729400634766, "loss": 0.2777, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3805728554725647, "rewards/margins": 2.2908310890197754, "rewards/rejected": -1.9102582931518555, "step": 1830 }, { "epoch": 0.927536231884058, "grad_norm": 28.5, "learning_rate": 3.953107960741548e-08, "logits/chosen": -0.8962399363517761, "logits/rejected": -1.0039699077606201, "logps/chosen": -59.460975646972656, "logps/rejected": -84.71418762207031, "loss": 0.3002, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.2653732895851135, "rewards/margins": 2.18709135055542, "rewards/rejected": -1.9217180013656616, "step": 1840 }, { "epoch": 0.9325771896660365, "grad_norm": 25.375, "learning_rate": 3.680479825517993e-08, "logits/chosen": -0.8921709060668945, "logits/rejected": -0.9834814071655273, "logps/chosen": -59.8410530090332, "logps/rejected": -84.02848052978516, "loss": 0.2857, "rewards/accuracies": 0.875, "rewards/chosen": 0.2075158655643463, "rewards/margins": 2.0984721183776855, "rewards/rejected": -1.890955924987793, "step": 1850 }, { "epoch": 0.9376181474480151, "grad_norm": 17.0, "learning_rate": 3.407851690294438e-08, "logits/chosen": -0.8759490847587585, "logits/rejected": -0.9916941523551941, "logps/chosen": -58.60956954956055, "logps/rejected": -84.21783447265625, "loss": 0.2933, "rewards/accuracies": 0.875, "rewards/chosen": 0.34195294976234436, "rewards/margins": 2.2317111492156982, "rewards/rejected": -1.8897583484649658, "step": 1860 }, { "epoch": 0.9426591052299937, "grad_norm": 16.125, "learning_rate": 3.135223555070883e-08, "logits/chosen": -0.8751128315925598, "logits/rejected": -0.9996623992919922, "logps/chosen": -58.18467330932617, "logps/rejected": -85.35218811035156, "loss": 0.2311, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3810577392578125, "rewards/margins": 2.361508369445801, "rewards/rejected": -1.9804503917694092, "step": 1870 }, { "epoch": 0.9477000630119723, "grad_norm": 20.625, "learning_rate": 2.862595419847328e-08, "logits/chosen": -0.8567900657653809, "logits/rejected": -0.96808922290802, "logps/chosen": -58.792396545410156, "logps/rejected": -84.65996551513672, "loss": 0.2498, "rewards/accuracies": 0.90625, "rewards/chosen": 0.33555370569229126, "rewards/margins": 2.2720112800598145, "rewards/rejected": -1.9364579916000366, "step": 1880 }, { "epoch": 0.9527410207939508, "grad_norm": 16.25, "learning_rate": 2.589967284623773e-08, "logits/chosen": -0.8800870776176453, "logits/rejected": -1.0060070753097534, "logps/chosen": -57.40453338623047, "logps/rejected": -85.77619171142578, "loss": 0.2096, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4209045469760895, "rewards/margins": 2.4640016555786133, "rewards/rejected": -2.0430970191955566, "step": 1890 }, { "epoch": 0.9577819785759294, "grad_norm": 32.75, "learning_rate": 2.317339149400218e-08, "logits/chosen": -0.8548176884651184, "logits/rejected": -0.9700508117675781, "logps/chosen": -59.573455810546875, "logps/rejected": -84.24436950683594, "loss": 0.2916, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.2536848485469818, "rewards/margins": 2.134523391723633, "rewards/rejected": -1.8808386325836182, "step": 1900 }, { "epoch": 0.9577819785759294, "eval_logits/chosen": -0.8832784295082092, "eval_logits/rejected": -0.9866493344306946, "eval_logps/chosen": -59.40935516357422, "eval_logps/rejected": -84.93498229980469, "eval_loss": 0.2900025248527527, "eval_rewards/accuracies": 0.8886227607727051, "eval_rewards/chosen": 0.25969186425209045, "eval_rewards/margins": 2.2144293785095215, "eval_rewards/rejected": -1.9547375440597534, "eval_runtime": 70.3437, "eval_samples_per_second": 23.741, "eval_steps_per_second": 23.741, "step": 1900 }, { "epoch": 0.962822936357908, "grad_norm": 22.125, "learning_rate": 2.044711014176663e-08, "logits/chosen": -0.9094161987304688, "logits/rejected": -1.0066895484924316, "logps/chosen": -58.97810745239258, "logps/rejected": -83.76020812988281, "loss": 0.2996, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.2789512574672699, "rewards/margins": 2.176806688308716, "rewards/rejected": -1.897855520248413, "step": 1910 }, { "epoch": 0.9678638941398866, "grad_norm": 16.75, "learning_rate": 1.772082878953108e-08, "logits/chosen": -0.8881312608718872, "logits/rejected": -0.9979362487792969, "logps/chosen": -58.24678421020508, "logps/rejected": -84.76899719238281, "loss": 0.2573, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.3678635060787201, "rewards/margins": 2.299140453338623, "rewards/rejected": -1.9312770366668701, "step": 1920 }, { "epoch": 0.9729048519218652, "grad_norm": 10.75, "learning_rate": 1.4994547437295527e-08, "logits/chosen": -0.8843402862548828, "logits/rejected": -0.9742966890335083, "logps/chosen": -59.153114318847656, "logps/rejected": -85.64107513427734, "loss": 0.2745, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2713843286037445, "rewards/margins": 2.270181179046631, "rewards/rejected": -1.998796820640564, "step": 1930 }, { "epoch": 0.9779458097038437, "grad_norm": 25.375, "learning_rate": 1.2268266085059978e-08, "logits/chosen": -0.872350811958313, "logits/rejected": -0.9762080907821655, "logps/chosen": -58.68449783325195, "logps/rejected": -84.45391845703125, "loss": 0.2923, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.31912150979042053, "rewards/margins": 2.221708059310913, "rewards/rejected": -1.9025866985321045, "step": 1940 }, { "epoch": 0.9829867674858223, "grad_norm": 17.875, "learning_rate": 9.541984732824428e-09, "logits/chosen": -0.8793309926986694, "logits/rejected": -0.9880158305168152, "logps/chosen": -58.13665771484375, "logps/rejected": -85.4889144897461, "loss": 0.2236, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.3792751729488373, "rewards/margins": 2.364917039871216, "rewards/rejected": -1.9856418371200562, "step": 1950 }, { "epoch": 0.9880277252678009, "grad_norm": 14.8125, "learning_rate": 6.815703380588876e-09, "logits/chosen": -0.8793843984603882, "logits/rejected": -0.9873378872871399, "logps/chosen": -57.81184005737305, "logps/rejected": -85.71204376220703, "loss": 0.235, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3983749747276306, "rewards/margins": 2.4253978729248047, "rewards/rejected": -2.0270228385925293, "step": 1960 }, { "epoch": 0.9930686830497795, "grad_norm": 22.75, "learning_rate": 4.089422028353326e-09, "logits/chosen": -0.8643286824226379, "logits/rejected": -0.9733026623725891, "logps/chosen": -58.93427658081055, "logps/rejected": -84.8313980102539, "loss": 0.2905, "rewards/accuracies": 0.875, "rewards/chosen": 0.29614943265914917, "rewards/margins": 2.2761871814727783, "rewards/rejected": -1.9800374507904053, "step": 1970 }, { "epoch": 0.998109640831758, "grad_norm": 31.75, "learning_rate": 1.3631406761177753e-09, "logits/chosen": -0.8575819134712219, "logits/rejected": -0.9438503980636597, "logps/chosen": -60.40618896484375, "logps/rejected": -82.56617736816406, "loss": 0.3551, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2122369259595871, "rewards/margins": 1.9236053228378296, "rewards/rejected": -1.7113683223724365, "step": 1980 } ], "logging_steps": 10, "max_steps": 1984, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }