{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9666818804198996, "eval_steps": 100, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.559270516717325e-10, "logits/chosen": -0.9643518328666687, "logits/rejected": -0.9552459120750427, "logps/chosen": -76.82135772705078, "logps/rejected": -59.52644348144531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 4.559270516717325e-09, "logits/chosen": -0.9517099857330322, "logits/rejected": -0.9616715908050537, "logps/chosen": -93.39585876464844, "logps/rejected": -69.06417846679688, "loss": 0.6969, "rewards/accuracies": 0.3055555522441864, "rewards/chosen": -0.04679955542087555, "rewards/margins": -0.035246770828962326, "rewards/rejected": -0.011552784591913223, "step": 10 }, { "epoch": 0.01, "learning_rate": 9.11854103343465e-09, "logits/chosen": -0.9816751480102539, "logits/rejected": -1.003169298171997, "logps/chosen": -90.08583068847656, "logps/rejected": -69.84603118896484, "loss": 0.6993, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00020122528076171875, "rewards/margins": -0.014502143487334251, "rewards/rejected": 0.014300918206572533, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.3677811550151975e-08, "logits/chosen": -0.9795465469360352, "logits/rejected": -0.9944146275520325, "logps/chosen": -90.3318099975586, "logps/rejected": -66.77433013916016, "loss": 0.6898, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.007479047868400812, "rewards/margins": 0.011048078536987305, "rewards/rejected": -0.0035690306685864925, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.82370820668693e-08, "logits/chosen": -0.9575172662734985, "logits/rejected": -0.974955677986145, "logps/chosen": -94.9634017944336, "logps/rejected": -64.13943481445312, "loss": 0.6901, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.020472276955842972, "rewards/margins": -0.0019337296253070235, "rewards/rejected": -0.018538545817136765, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.2796352583586623e-08, "logits/chosen": -0.9589303731918335, "logits/rejected": -0.9642618894577026, "logps/chosen": -96.45872497558594, "logps/rejected": -69.0987319946289, "loss": 0.679, "rewards/accuracies": 0.5, "rewards/chosen": -0.0022797822020947933, "rewards/margins": 0.03321406990289688, "rewards/rejected": -0.03549385070800781, "step": 50 }, { "epoch": 0.03, "learning_rate": 2.735562310030395e-08, "logits/chosen": -0.9560245275497437, "logits/rejected": -0.9745736122131348, "logps/chosen": -90.70116424560547, "logps/rejected": -70.5436782836914, "loss": 0.6589, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.014189362525939941, "rewards/margins": 0.053305577486753464, "rewards/rejected": -0.03911621496081352, "step": 60 }, { "epoch": 0.03, "learning_rate": 3.191489361702128e-08, "logits/chosen": -1.006732702255249, "logits/rejected": -1.0067856311798096, "logps/chosen": -86.19349670410156, "logps/rejected": -66.15892028808594, "loss": 0.6445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.066792331635952, "rewards/margins": 0.12021535634994507, "rewards/rejected": -0.05342302471399307, "step": 70 }, { "epoch": 0.04, "learning_rate": 3.64741641337386e-08, "logits/chosen": -1.0003823041915894, "logits/rejected": -1.0047554969787598, "logps/chosen": -96.32222747802734, "logps/rejected": -68.91173553466797, "loss": 0.6131, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.09268401563167572, "rewards/margins": 0.19902931153774261, "rewards/rejected": -0.1063452959060669, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.1033434650455923e-08, "logits/chosen": -0.9865853190422058, "logits/rejected": -0.9936298131942749, "logps/chosen": -89.23389434814453, "logps/rejected": -64.02348327636719, "loss": 0.5757, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.13552097976207733, "rewards/margins": 0.2616090774536133, "rewards/rejected": -0.12608811259269714, "step": 90 }, { "epoch": 0.05, "learning_rate": 4.559270516717325e-08, "logits/chosen": -0.9565617442131042, "logits/rejected": -0.9617575407028198, "logps/chosen": -91.22322845458984, "logps/rejected": -68.244140625, "loss": 0.5412, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14585547149181366, "rewards/margins": 0.3651939332485199, "rewards/rejected": -0.21933846175670624, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -0.9840940237045288, "eval_logits/rejected": -0.9992539286613464, "eval_logps/chosen": -89.1026611328125, "eval_logps/rejected": -65.7355728149414, "eval_loss": 0.5342935919761658, "eval_rewards/accuracies": 0.9441340565681458, "eval_rewards/chosen": 0.16487935185432434, "eval_rewards/margins": 0.3618040084838867, "eval_rewards/rejected": -0.19692467153072357, "eval_runtime": 71.4873, "eval_samples_per_second": 40.035, "eval_steps_per_second": 2.504, "step": 100 }, { "epoch": 0.05, "learning_rate": 5.015197568389058e-08, "logits/chosen": -0.9964181780815125, "logits/rejected": -1.000222086906433, "logps/chosen": -89.18244934082031, "logps/rejected": -67.94490814208984, "loss": 0.5304, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.12820395827293396, "rewards/margins": 0.37578168511390686, "rewards/rejected": -0.2475777566432953, "step": 110 }, { "epoch": 0.05, "learning_rate": 5.47112462006079e-08, "logits/chosen": -0.9542981386184692, "logits/rejected": -0.9645611643791199, "logps/chosen": -85.86537170410156, "logps/rejected": -67.51045989990234, "loss": 0.4784, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18605713546276093, "rewards/margins": 0.5225586891174316, "rewards/rejected": -0.3365015983581543, "step": 120 }, { "epoch": 0.06, "learning_rate": 5.9270516717325223e-08, "logits/chosen": -0.9908889532089233, "logits/rejected": -0.9953571557998657, "logps/chosen": -92.44700622558594, "logps/rejected": -69.91798400878906, "loss": 0.3954, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.338260680437088, "rewards/margins": 0.743990957736969, "rewards/rejected": -0.4057301878929138, "step": 130 }, { "epoch": 0.06, "learning_rate": 6.382978723404255e-08, "logits/chosen": -0.954239010810852, "logits/rejected": -0.968449592590332, "logps/chosen": -91.10675811767578, "logps/rejected": -67.44385528564453, "loss": 0.3584, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.36729928851127625, "rewards/margins": 0.9051514863967896, "rewards/rejected": -0.5378521680831909, "step": 140 }, { "epoch": 0.07, "learning_rate": 6.838905775075987e-08, "logits/chosen": -0.9643675088882446, "logits/rejected": -0.9654477834701538, "logps/chosen": -93.19139099121094, "logps/rejected": -69.21319580078125, "loss": 0.3223, "rewards/accuracies": 1.0, "rewards/chosen": 0.491039901971817, "rewards/margins": 1.1034448146820068, "rewards/rejected": -0.6124049425125122, "step": 150 }, { "epoch": 0.07, "learning_rate": 7.29483282674772e-08, "logits/chosen": -0.9584270715713501, "logits/rejected": -0.9729134440422058, "logps/chosen": -93.25248718261719, "logps/rejected": -67.92393493652344, "loss": 0.3027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5195748805999756, "rewards/margins": 1.1697877645492554, "rewards/rejected": -0.6502128839492798, "step": 160 }, { "epoch": 0.08, "learning_rate": 7.750759878419453e-08, "logits/chosen": -0.9704807996749878, "logits/rejected": -0.9716874361038208, "logps/chosen": -83.62016296386719, "logps/rejected": -66.22118377685547, "loss": 0.3037, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4028417468070984, "rewards/margins": 1.2314786911010742, "rewards/rejected": -0.8286369442939758, "step": 170 }, { "epoch": 0.08, "learning_rate": 8.206686930091185e-08, "logits/chosen": -0.9470335245132446, "logits/rejected": -0.963607668876648, "logps/chosen": -87.67333984375, "logps/rejected": -67.34207916259766, "loss": 0.2843, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5842958688735962, "rewards/margins": 1.32895028591156, "rewards/rejected": -0.7446545362472534, "step": 180 }, { "epoch": 0.09, "learning_rate": 8.662613981762918e-08, "logits/chosen": -0.965084433555603, "logits/rejected": -0.9808289408683777, "logps/chosen": -89.90495300292969, "logps/rejected": -65.31876373291016, "loss": 0.2496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6354175806045532, "rewards/margins": 1.4908440113067627, "rewards/rejected": -0.8554266095161438, "step": 190 }, { "epoch": 0.09, "learning_rate": 9.11854103343465e-08, "logits/chosen": -0.9802848100662231, "logits/rejected": -0.9860008358955383, "logps/chosen": -83.04852294921875, "logps/rejected": -66.60576629638672, "loss": 0.2248, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6749335527420044, "rewards/margins": 1.5874103307724, "rewards/rejected": -0.9124768972396851, "step": 200 }, { "epoch": 0.09, "eval_logits/chosen": -0.9838407635688782, "eval_logits/rejected": -0.996880292892456, "eval_logps/chosen": -88.08007049560547, "eval_logps/rejected": -67.23590850830078, "eval_loss": 0.22697897255420685, "eval_rewards/accuracies": 0.9832402467727661, "eval_rewards/chosen": 0.6761797666549683, "eval_rewards/margins": 1.6232739686965942, "eval_rewards/rejected": -0.9470942616462708, "eval_runtime": 105.1739, "eval_samples_per_second": 27.212, "eval_steps_per_second": 1.702, "step": 200 }, { "epoch": 0.1, "learning_rate": 9.574468085106384e-08, "logits/chosen": -0.9623929262161255, "logits/rejected": -0.9724255800247192, "logps/chosen": -82.79269409179688, "logps/rejected": -68.85140228271484, "loss": 0.1957, "rewards/accuracies": 1.0, "rewards/chosen": 0.6824924349784851, "rewards/margins": 1.8234453201293945, "rewards/rejected": -1.1409530639648438, "step": 210 }, { "epoch": 0.1, "learning_rate": 1.0030395136778115e-07, "logits/chosen": -0.9426813125610352, "logits/rejected": -0.95277339220047, "logps/chosen": -90.78047943115234, "logps/rejected": -70.69515991210938, "loss": 0.1768, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7748227715492249, "rewards/margins": 2.0056302547454834, "rewards/rejected": -1.2308075428009033, "step": 220 }, { "epoch": 0.1, "learning_rate": 1.0486322188449848e-07, "logits/chosen": -0.9746893048286438, "logits/rejected": -0.9837571978569031, "logps/chosen": -91.14712524414062, "logps/rejected": -69.38472747802734, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": 0.9424777030944824, "rewards/margins": 2.5599138736724854, "rewards/rejected": -1.6174360513687134, "step": 230 }, { "epoch": 0.11, "learning_rate": 1.094224924012158e-07, "logits/chosen": -0.9637104868888855, "logits/rejected": -0.9687842130661011, "logps/chosen": -88.99737548828125, "logps/rejected": -69.65807342529297, "loss": 0.1228, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0617095232009888, "rewards/margins": 2.6283512115478516, "rewards/rejected": -1.5666416883468628, "step": 240 }, { "epoch": 0.11, "learning_rate": 1.1398176291793313e-07, "logits/chosen": -0.9593694806098938, "logits/rejected": -0.9832932353019714, "logps/chosen": -89.79000091552734, "logps/rejected": -71.10289001464844, "loss": 0.116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0695412158966064, "rewards/margins": 2.6741912364959717, "rewards/rejected": -1.6046499013900757, "step": 250 }, { "epoch": 0.12, "learning_rate": 1.1854103343465045e-07, "logits/chosen": -0.9898384809494019, "logits/rejected": -0.9927492141723633, "logps/chosen": -84.07673645019531, "logps/rejected": -67.94017028808594, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 1.1412315368652344, "rewards/margins": 2.985485315322876, "rewards/rejected": -1.8442538976669312, "step": 260 }, { "epoch": 0.12, "learning_rate": 1.2310030395136776e-07, "logits/chosen": -0.9738420248031616, "logits/rejected": -0.9792343974113464, "logps/chosen": -88.48084259033203, "logps/rejected": -75.20460510253906, "loss": 0.1053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2394745349884033, "rewards/margins": 3.0349550247192383, "rewards/rejected": -1.7954803705215454, "step": 270 }, { "epoch": 0.13, "learning_rate": 1.276595744680851e-07, "logits/chosen": -0.973858654499054, "logits/rejected": -0.977423369884491, "logps/chosen": -88.2381820678711, "logps/rejected": -69.44757080078125, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 1.0679655075073242, "rewards/margins": 3.237596035003662, "rewards/rejected": -2.169631004333496, "step": 280 }, { "epoch": 0.13, "learning_rate": 1.3221884498480242e-07, "logits/chosen": -0.9805269241333008, "logits/rejected": -0.9859040975570679, "logps/chosen": -86.40400695800781, "logps/rejected": -73.2696533203125, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 1.1778849363327026, "rewards/margins": 3.2878527641296387, "rewards/rejected": -2.1099677085876465, "step": 290 }, { "epoch": 0.14, "learning_rate": 1.3677811550151974e-07, "logits/chosen": -0.9615720510482788, "logits/rejected": -0.9615543484687805, "logps/chosen": -85.02987670898438, "logps/rejected": -69.7044906616211, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 1.123947262763977, "rewards/margins": 3.40171480178833, "rewards/rejected": -2.2777674198150635, "step": 300 }, { "epoch": 0.14, "eval_logits/chosen": -0.9878929853439331, "eval_logits/rejected": -0.9959621429443359, "eval_logps/chosen": -86.76927947998047, "eval_logps/rejected": -69.99213409423828, "eval_loss": 0.08308280259370804, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.3315751552581787, "eval_rewards/margins": 3.6567811965942383, "eval_rewards/rejected": -2.3252058029174805, "eval_runtime": 61.2711, "eval_samples_per_second": 46.71, "eval_steps_per_second": 2.921, "step": 300 }, { "epoch": 0.14, "learning_rate": 1.4133738601823708e-07, "logits/chosen": -0.9472341537475586, "logits/rejected": -0.9612518548965454, "logps/chosen": -89.58518981933594, "logps/rejected": -70.13312530517578, "loss": 0.0787, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4316556453704834, "rewards/margins": 3.8420116901397705, "rewards/rejected": -2.410356044769287, "step": 310 }, { "epoch": 0.15, "learning_rate": 1.458966565349544e-07, "logits/chosen": -0.9657572507858276, "logits/rejected": -0.9684290885925293, "logps/chosen": -86.15750122070312, "logps/rejected": -69.44698333740234, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": 1.158479928970337, "rewards/margins": 3.4451236724853516, "rewards/rejected": -2.2866437435150146, "step": 320 }, { "epoch": 0.15, "learning_rate": 1.5045592705167174e-07, "logits/chosen": -0.9523347020149231, "logits/rejected": -0.9598930478096008, "logps/chosen": -91.19930267333984, "logps/rejected": -71.31523895263672, "loss": 0.0718, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.529072642326355, "rewards/margins": 3.7886452674865723, "rewards/rejected": -2.2595720291137695, "step": 330 }, { "epoch": 0.16, "learning_rate": 1.5501519756838906e-07, "logits/chosen": -0.9740470051765442, "logits/rejected": -0.9752241373062134, "logps/chosen": -82.92659759521484, "logps/rejected": -69.77666473388672, "loss": 0.0723, "rewards/accuracies": 1.0, "rewards/chosen": 1.3220869302749634, "rewards/margins": 4.073119163513184, "rewards/rejected": -2.7510321140289307, "step": 340 }, { "epoch": 0.16, "learning_rate": 1.5957446808510638e-07, "logits/chosen": -0.979371190071106, "logits/rejected": -0.9843432307243347, "logps/chosen": -88.71659851074219, "logps/rejected": -76.50203704833984, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 1.327655553817749, "rewards/margins": 4.103621006011963, "rewards/rejected": -2.7759652137756348, "step": 350 }, { "epoch": 0.16, "learning_rate": 1.641337386018237e-07, "logits/chosen": -0.9631199836730957, "logits/rejected": -0.9769188165664673, "logps/chosen": -88.78105163574219, "logps/rejected": -71.40655517578125, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.553839087486267, "rewards/margins": 4.10164213180542, "rewards/rejected": -2.547802448272705, "step": 360 }, { "epoch": 0.17, "learning_rate": 1.6869300911854104e-07, "logits/chosen": -0.9622189402580261, "logits/rejected": -0.9701143503189087, "logps/chosen": -89.26042938232422, "logps/rejected": -75.21412658691406, "loss": 0.0687, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.384950041770935, "rewards/margins": 4.296316623687744, "rewards/rejected": -2.9113662242889404, "step": 370 }, { "epoch": 0.17, "learning_rate": 1.7325227963525835e-07, "logits/chosen": -0.9843562245368958, "logits/rejected": -0.9918986558914185, "logps/chosen": -86.35823059082031, "logps/rejected": -72.2359848022461, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 1.5267091989517212, "rewards/margins": 4.862682819366455, "rewards/rejected": -3.3359732627868652, "step": 380 }, { "epoch": 0.18, "learning_rate": 1.7781155015197567e-07, "logits/chosen": -0.9975897073745728, "logits/rejected": -0.9955156445503235, "logps/chosen": -91.64771270751953, "logps/rejected": -74.9527816772461, "loss": 0.0481, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4639146327972412, "rewards/margins": 5.018456935882568, "rewards/rejected": -3.5545425415039062, "step": 390 }, { "epoch": 0.18, "learning_rate": 1.82370820668693e-07, "logits/chosen": -0.9872757792472839, "logits/rejected": -0.9929397702217102, "logps/chosen": -86.04756164550781, "logps/rejected": -75.50514221191406, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 1.6687867641448975, "rewards/margins": 5.520654201507568, "rewards/rejected": -3.851867198944092, "step": 400 }, { "epoch": 0.18, "eval_logits/chosen": -1.001560091972351, "eval_logits/rejected": -0.9991575479507446, "eval_logps/chosen": -86.00261688232422, "eval_logps/rejected": -72.45541381835938, "eval_loss": 0.04544173553586006, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.7149040699005127, "eval_rewards/margins": 5.271754264831543, "eval_rewards/rejected": -3.556849956512451, "eval_runtime": 62.4842, "eval_samples_per_second": 45.804, "eval_steps_per_second": 2.865, "step": 400 }, { "epoch": 0.19, "learning_rate": 1.869300911854103e-07, "logits/chosen": -1.0112515687942505, "logits/rejected": -1.0055806636810303, "logps/chosen": -86.33168029785156, "logps/rejected": -79.20642852783203, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 1.5995696783065796, "rewards/margins": 5.682262420654297, "rewards/rejected": -4.082693099975586, "step": 410 }, { "epoch": 0.19, "learning_rate": 1.9148936170212767e-07, "logits/chosen": -0.9795786142349243, "logits/rejected": -0.9767486453056335, "logps/chosen": -89.1741714477539, "logps/rejected": -75.95233917236328, "loss": 0.0414, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1586321592330933, "rewards/margins": 5.276698112487793, "rewards/rejected": -4.118066310882568, "step": 420 }, { "epoch": 0.2, "learning_rate": 1.96048632218845e-07, "logits/chosen": -1.0036985874176025, "logits/rejected": -0.9971052408218384, "logps/chosen": -86.45294189453125, "logps/rejected": -79.10576629638672, "loss": 0.0317, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8366873264312744, "rewards/margins": 6.2948222160339355, "rewards/rejected": -4.458134651184082, "step": 430 }, { "epoch": 0.2, "learning_rate": 2.006079027355623e-07, "logits/chosen": -1.0170124769210815, "logits/rejected": -1.0096765756607056, "logps/chosen": -90.57597351074219, "logps/rejected": -72.65457916259766, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 2.073103427886963, "rewards/margins": 6.505836486816406, "rewards/rejected": -4.432732582092285, "step": 440 }, { "epoch": 0.21, "learning_rate": 2.0516717325227962e-07, "logits/chosen": -1.0112982988357544, "logits/rejected": -1.0087357759475708, "logps/chosen": -87.71390533447266, "logps/rejected": -78.8847427368164, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 1.966575264930725, "rewards/margins": 7.236502170562744, "rewards/rejected": -5.26992654800415, "step": 450 }, { "epoch": 0.21, "learning_rate": 2.0972644376899697e-07, "logits/chosen": -0.9955867528915405, "logits/rejected": -0.9789519309997559, "logps/chosen": -93.02728271484375, "logps/rejected": -79.93946838378906, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 1.8666505813598633, "rewards/margins": 7.276576042175293, "rewards/rejected": -5.409926414489746, "step": 460 }, { "epoch": 0.21, "learning_rate": 2.1428571428571428e-07, "logits/chosen": -1.0107711553573608, "logits/rejected": -1.015987753868103, "logps/chosen": -91.70677185058594, "logps/rejected": -78.36326599121094, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9241142272949219, "rewards/margins": 7.251917839050293, "rewards/rejected": -5.327803611755371, "step": 470 }, { "epoch": 0.22, "learning_rate": 2.188449848024316e-07, "logits/chosen": -1.0210905075073242, "logits/rejected": -1.0056220293045044, "logps/chosen": -90.60785675048828, "logps/rejected": -77.8875961303711, "loss": 0.0252, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8818817138671875, "rewards/margins": 7.938240051269531, "rewards/rejected": -6.056358814239502, "step": 480 }, { "epoch": 0.22, "learning_rate": 2.2340425531914892e-07, "logits/chosen": -1.0591073036193848, "logits/rejected": -1.0334078073501587, "logps/chosen": -86.05088806152344, "logps/rejected": -80.09144592285156, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 1.9079704284667969, "rewards/margins": 7.765076637268066, "rewards/rejected": -5.857105255126953, "step": 490 }, { "epoch": 0.23, "learning_rate": 2.2796352583586626e-07, "logits/chosen": -1.044002652168274, "logits/rejected": -1.0229895114898682, "logps/chosen": -81.89948272705078, "logps/rejected": -77.97576141357422, "loss": 0.0217, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5751844644546509, "rewards/margins": 7.957855224609375, "rewards/rejected": -6.382671356201172, "step": 500 }, { "epoch": 0.23, "eval_logits/chosen": -1.0691347122192383, "eval_logits/rejected": -1.0533946752548218, "eval_logps/chosen": -85.60762023925781, "eval_logps/rejected": -77.9649887084961, "eval_loss": 0.021840358152985573, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.912401795387268, "eval_rewards/margins": 8.22403335571289, "eval_rewards/rejected": -6.3116326332092285, "eval_runtime": 60.6624, "eval_samples_per_second": 47.179, "eval_steps_per_second": 2.951, "step": 500 }, { "epoch": 0.23, "learning_rate": 2.3252279635258358e-07, "logits/chosen": -1.0656869411468506, "logits/rejected": -1.0443495512008667, "logps/chosen": -80.74382781982422, "logps/rejected": -80.70405578613281, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 1.984548807144165, "rewards/margins": 8.708304405212402, "rewards/rejected": -6.7237548828125, "step": 510 }, { "epoch": 0.24, "learning_rate": 2.370820668693009e-07, "logits/chosen": -1.0407330989837646, "logits/rejected": -1.019323706626892, "logps/chosen": -94.31208801269531, "logps/rejected": -84.92703247070312, "loss": 0.0192, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9942207336425781, "rewards/margins": 8.950647354125977, "rewards/rejected": -6.956425666809082, "step": 520 }, { "epoch": 0.24, "learning_rate": 2.4164133738601824e-07, "logits/chosen": -1.0353573560714722, "logits/rejected": -1.0317944288253784, "logps/chosen": -84.43415832519531, "logps/rejected": -80.83734130859375, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 2.38482928276062, "rewards/margins": 9.657920837402344, "rewards/rejected": -7.273091793060303, "step": 530 }, { "epoch": 0.25, "learning_rate": 2.4620060790273553e-07, "logits/chosen": -1.0673249959945679, "logits/rejected": -1.0390758514404297, "logps/chosen": -86.52366638183594, "logps/rejected": -82.38668823242188, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.5419979095458984, "rewards/margins": 9.403617858886719, "rewards/rejected": -7.861618995666504, "step": 540 }, { "epoch": 0.25, "learning_rate": 2.5075987841945287e-07, "logits/chosen": -1.0771315097808838, "logits/rejected": -1.0471489429473877, "logps/chosen": -94.00312805175781, "logps/rejected": -81.71351623535156, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8300154209136963, "rewards/margins": 9.741002082824707, "rewards/rejected": -7.910986423492432, "step": 550 }, { "epoch": 0.26, "learning_rate": 2.553191489361702e-07, "logits/chosen": -1.0417684316635132, "logits/rejected": -1.0242894887924194, "logps/chosen": -85.45355987548828, "logps/rejected": -84.82173156738281, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 2.176011323928833, "rewards/margins": 10.29346752166748, "rewards/rejected": -8.117456436157227, "step": 560 }, { "epoch": 0.26, "learning_rate": 2.598784194528875e-07, "logits/chosen": -1.0521628856658936, "logits/rejected": -1.0290045738220215, "logps/chosen": -86.50548553466797, "logps/rejected": -84.81304931640625, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 1.396401286125183, "rewards/margins": 9.808305740356445, "rewards/rejected": -8.411903381347656, "step": 570 }, { "epoch": 0.26, "learning_rate": 2.6443768996960485e-07, "logits/chosen": -1.0446751117706299, "logits/rejected": -1.0253543853759766, "logps/chosen": -83.4941635131836, "logps/rejected": -81.4127197265625, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4424480199813843, "rewards/margins": 10.39416217803955, "rewards/rejected": -8.951713562011719, "step": 580 }, { "epoch": 0.27, "learning_rate": 2.689969604863222e-07, "logits/chosen": -1.0527292490005493, "logits/rejected": -1.028416633605957, "logps/chosen": -89.69185638427734, "logps/rejected": -84.93494415283203, "loss": 0.016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6588973999023438, "rewards/margins": 10.561676025390625, "rewards/rejected": -8.902778625488281, "step": 590 }, { "epoch": 0.27, "learning_rate": 2.735562310030395e-07, "logits/chosen": -1.0560492277145386, "logits/rejected": -1.031136155128479, "logps/chosen": -88.6587142944336, "logps/rejected": -89.13833618164062, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 1.94255793094635, "rewards/margins": 11.583595275878906, "rewards/rejected": -9.641037940979004, "step": 600 }, { "epoch": 0.27, "eval_logits/chosen": -1.0850204229354858, "eval_logits/rejected": -1.0588685274124146, "eval_logps/chosen": -85.50906372070312, "eval_logps/rejected": -83.90418243408203, "eval_loss": 0.013939271681010723, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.961681842803955, "eval_rewards/margins": 11.242914199829102, "eval_rewards/rejected": -9.281231880187988, "eval_runtime": 67.4874, "eval_samples_per_second": 42.408, "eval_steps_per_second": 2.652, "step": 600 }, { "epoch": 0.28, "learning_rate": 2.781155015197568e-07, "logits/chosen": -1.0646154880523682, "logits/rejected": -1.0403302907943726, "logps/chosen": -87.16560363769531, "logps/rejected": -84.03047943115234, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1723438501358032, "rewards/margins": 10.494967460632324, "rewards/rejected": -9.322624206542969, "step": 610 }, { "epoch": 0.28, "learning_rate": 2.8267477203647417e-07, "logits/chosen": -1.040969729423523, "logits/rejected": -1.025119423866272, "logps/chosen": -86.0352783203125, "logps/rejected": -84.01143646240234, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 2.131622314453125, "rewards/margins": 11.987198829650879, "rewards/rejected": -9.855576515197754, "step": 620 }, { "epoch": 0.29, "learning_rate": 2.872340425531915e-07, "logits/chosen": -1.0877559185028076, "logits/rejected": -1.057796835899353, "logps/chosen": -88.63595581054688, "logps/rejected": -93.89146423339844, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 1.9702014923095703, "rewards/margins": 12.47899055480957, "rewards/rejected": -10.508790016174316, "step": 630 }, { "epoch": 0.29, "learning_rate": 2.917933130699088e-07, "logits/chosen": -1.0618011951446533, "logits/rejected": -1.0364282131195068, "logps/chosen": -83.58055114746094, "logps/rejected": -87.19285583496094, "loss": 0.0172, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3319017887115479, "rewards/margins": 12.28033447265625, "rewards/rejected": -10.948432922363281, "step": 640 }, { "epoch": 0.3, "learning_rate": 2.9635258358662614e-07, "logits/chosen": -1.0487759113311768, "logits/rejected": -1.0237376689910889, "logps/chosen": -97.61241149902344, "logps/rejected": -89.84696197509766, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 2.0295894145965576, "rewards/margins": 13.6948823928833, "rewards/rejected": -11.665290832519531, "step": 650 }, { "epoch": 0.3, "learning_rate": 2.9989856297548603e-07, "logits/chosen": -1.0819432735443115, "logits/rejected": -1.0479309558868408, "logps/chosen": -88.46907043457031, "logps/rejected": -87.63093566894531, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 1.6469261646270752, "rewards/margins": 12.705039978027344, "rewards/rejected": -11.058113098144531, "step": 660 }, { "epoch": 0.31, "learning_rate": 2.9939137785291633e-07, "logits/chosen": -1.0461461544036865, "logits/rejected": -1.0262444019317627, "logps/chosen": -88.99845886230469, "logps/rejected": -88.98668670654297, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 1.7188533544540405, "rewards/margins": 12.911355972290039, "rewards/rejected": -11.192502975463867, "step": 670 }, { "epoch": 0.31, "learning_rate": 2.9888419273034654e-07, "logits/chosen": -1.0983555316925049, "logits/rejected": -1.058870792388916, "logps/chosen": -92.72132873535156, "logps/rejected": -91.1702880859375, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1177186965942383, "rewards/margins": 12.790249824523926, "rewards/rejected": -10.672532081604004, "step": 680 }, { "epoch": 0.31, "learning_rate": 2.9837700760777684e-07, "logits/chosen": -1.0601097345352173, "logits/rejected": -1.0348825454711914, "logps/chosen": -86.94996643066406, "logps/rejected": -90.38238525390625, "loss": 0.0086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6195377111434937, "rewards/margins": 13.726669311523438, "rewards/rejected": -12.107131958007812, "step": 690 }, { "epoch": 0.32, "learning_rate": 2.978698224852071e-07, "logits/chosen": -1.0934697389602661, "logits/rejected": -1.0594290494918823, "logps/chosen": -89.61146545410156, "logps/rejected": -92.48619842529297, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0675747394561768, "rewards/margins": 13.449430465698242, "rewards/rejected": -11.381856918334961, "step": 700 }, { "epoch": 0.32, "eval_logits/chosen": -1.1140062808990479, "eval_logits/rejected": -1.0765314102172852, "eval_logps/chosen": -85.63339233398438, "eval_logps/rejected": -89.27959442138672, "eval_loss": 0.010984507389366627, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.8995180130004883, "eval_rewards/margins": 13.868452072143555, "eval_rewards/rejected": -11.968934059143066, "eval_runtime": 63.0243, "eval_samples_per_second": 45.411, "eval_steps_per_second": 2.84, "step": 700 }, { "epoch": 0.32, "learning_rate": 2.9736263736263735e-07, "logits/chosen": -1.1225736141204834, "logits/rejected": -1.079655408859253, "logps/chosen": -84.65995788574219, "logps/rejected": -93.70130920410156, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.7059738636016846, "rewards/margins": 13.980459213256836, "rewards/rejected": -12.27448558807373, "step": 710 }, { "epoch": 0.33, "learning_rate": 2.968554522400676e-07, "logits/chosen": -1.0669975280761719, "logits/rejected": -1.0285903215408325, "logps/chosen": -89.87354278564453, "logps/rejected": -91.52484893798828, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0469908714294434, "rewards/margins": 14.076245307922363, "rewards/rejected": -12.029253005981445, "step": 720 }, { "epoch": 0.33, "learning_rate": 2.9634826711749786e-07, "logits/chosen": -1.0944750308990479, "logits/rejected": -1.0600286722183228, "logps/chosen": -90.2714614868164, "logps/rejected": -86.80066680908203, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6248250007629395, "rewards/margins": 14.128560066223145, "rewards/rejected": -11.503734588623047, "step": 730 }, { "epoch": 0.34, "learning_rate": 2.958410819949281e-07, "logits/chosen": -1.0793449878692627, "logits/rejected": -1.0554149150848389, "logps/chosen": -78.48091888427734, "logps/rejected": -89.2685317993164, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.959774374961853, "rewards/margins": 14.275179862976074, "rewards/rejected": -12.315404891967773, "step": 740 }, { "epoch": 0.34, "learning_rate": 2.953338968723584e-07, "logits/chosen": -1.0978403091430664, "logits/rejected": -1.0679986476898193, "logps/chosen": -79.7038345336914, "logps/rejected": -89.61735534667969, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 2.5487966537475586, "rewards/margins": 15.569620132446289, "rewards/rejected": -13.020822525024414, "step": 750 }, { "epoch": 0.35, "learning_rate": 2.948267117497887e-07, "logits/chosen": -1.1022833585739136, "logits/rejected": -1.0697553157806396, "logps/chosen": -87.8471450805664, "logps/rejected": -95.08956146240234, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 2.898350715637207, "rewards/margins": 14.861310005187988, "rewards/rejected": -11.962959289550781, "step": 760 }, { "epoch": 0.35, "learning_rate": 2.9431952662721893e-07, "logits/chosen": -1.0957014560699463, "logits/rejected": -1.0692951679229736, "logps/chosen": -85.73948669433594, "logps/rejected": -89.58802795410156, "loss": 0.0072, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3441336154937744, "rewards/margins": 13.35987377166748, "rewards/rejected": -12.015739440917969, "step": 770 }, { "epoch": 0.36, "learning_rate": 2.938123415046492e-07, "logits/chosen": -1.0931169986724854, "logits/rejected": -1.0750610828399658, "logps/chosen": -83.94388580322266, "logps/rejected": -90.04351043701172, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.7953174114227295, "rewards/margins": 14.375581741333008, "rewards/rejected": -12.5802640914917, "step": 780 }, { "epoch": 0.36, "learning_rate": 2.9330515638207944e-07, "logits/chosen": -1.1000292301177979, "logits/rejected": -1.0617127418518066, "logps/chosen": -85.10647583007812, "logps/rejected": -95.74652099609375, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9242041110992432, "rewards/margins": 15.295672416687012, "rewards/rejected": -13.371467590332031, "step": 790 }, { "epoch": 0.37, "learning_rate": 2.927979712595097e-07, "logits/chosen": -1.074942708015442, "logits/rejected": -1.0459177494049072, "logps/chosen": -87.57856750488281, "logps/rejected": -92.28514862060547, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 1.659388780593872, "rewards/margins": 13.915122985839844, "rewards/rejected": -12.25573444366455, "step": 800 }, { "epoch": 0.37, "eval_logits/chosen": -1.1495593786239624, "eval_logits/rejected": -1.104751467704773, "eval_logps/chosen": -85.99646759033203, "eval_logps/rejected": -93.16914367675781, "eval_loss": 0.01038471981883049, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.7179815769195557, "eval_rewards/margins": 15.631699562072754, "eval_rewards/rejected": -13.913717269897461, "eval_runtime": 65.9739, "eval_samples_per_second": 43.381, "eval_steps_per_second": 2.713, "step": 800 }, { "epoch": 0.37, "learning_rate": 2.9229078613694e-07, "logits/chosen": -1.1248949766159058, "logits/rejected": -1.086717128753662, "logps/chosen": -86.31236267089844, "logps/rejected": -93.25736236572266, "loss": 0.0126, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9837150573730469, "rewards/margins": 15.558454513549805, "rewards/rejected": -13.574739456176758, "step": 810 }, { "epoch": 0.37, "learning_rate": 2.917836010143702e-07, "logits/chosen": -1.0978708267211914, "logits/rejected": -1.057366132736206, "logps/chosen": -81.67508697509766, "logps/rejected": -91.41726684570312, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 2.00944185256958, "rewards/margins": 15.302815437316895, "rewards/rejected": -13.293373107910156, "step": 820 }, { "epoch": 0.38, "learning_rate": 2.912764158918005e-07, "logits/chosen": -1.1044955253601074, "logits/rejected": -1.0758087635040283, "logps/chosen": -82.3414306640625, "logps/rejected": -97.22112274169922, "loss": 0.0106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6446453332901, "rewards/margins": 15.301877975463867, "rewards/rejected": -13.657232284545898, "step": 830 }, { "epoch": 0.38, "learning_rate": 2.9076923076923076e-07, "logits/chosen": -1.124459981918335, "logits/rejected": -1.0775994062423706, "logps/chosen": -90.77351379394531, "logps/rejected": -92.74385833740234, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.0721054077148438, "rewards/margins": 16.051000595092773, "rewards/rejected": -13.97889518737793, "step": 840 }, { "epoch": 0.39, "learning_rate": 2.90262045646661e-07, "logits/chosen": -1.1489288806915283, "logits/rejected": -1.1091543436050415, "logps/chosen": -83.20817565917969, "logps/rejected": -99.59390258789062, "loss": 0.0079, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3499611616134644, "rewards/margins": 16.251693725585938, "rewards/rejected": -14.9017333984375, "step": 850 }, { "epoch": 0.39, "learning_rate": 2.8975486052409127e-07, "logits/chosen": -1.1202361583709717, "logits/rejected": -1.0809228420257568, "logps/chosen": -93.94721984863281, "logps/rejected": -97.80915069580078, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.6343969106674194, "rewards/margins": 16.120519638061523, "rewards/rejected": -14.486124038696289, "step": 860 }, { "epoch": 0.4, "learning_rate": 2.892476754015215e-07, "logits/chosen": -1.1433136463165283, "logits/rejected": -1.0998663902282715, "logps/chosen": -88.42134094238281, "logps/rejected": -97.21299743652344, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4050545692443848, "rewards/margins": 17.111103057861328, "rewards/rejected": -14.706045150756836, "step": 870 }, { "epoch": 0.4, "learning_rate": 2.887404902789518e-07, "logits/chosen": -1.1467622518539429, "logits/rejected": -1.1034185886383057, "logps/chosen": -93.99958801269531, "logps/rejected": -99.74283599853516, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 2.3127968311309814, "rewards/margins": 16.9749698638916, "rewards/rejected": -14.6621732711792, "step": 880 }, { "epoch": 0.41, "learning_rate": 2.882333051563821e-07, "logits/chosen": -1.1312350034713745, "logits/rejected": -1.0877126455307007, "logps/chosen": -85.80730438232422, "logps/rejected": -98.7739028930664, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.6915788650512695, "rewards/margins": 17.439577102661133, "rewards/rejected": -15.747998237609863, "step": 890 }, { "epoch": 0.41, "learning_rate": 2.8772612003381234e-07, "logits/chosen": -1.1451927423477173, "logits/rejected": -1.1060543060302734, "logps/chosen": -86.88992309570312, "logps/rejected": -99.46288299560547, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 2.0115227699279785, "rewards/margins": 17.152305603027344, "rewards/rejected": -15.140782356262207, "step": 900 }, { "epoch": 0.41, "eval_logits/chosen": -1.1781193017959595, "eval_logits/rejected": -1.1281659603118896, "eval_logps/chosen": -86.0967788696289, "eval_logps/rejected": -96.05574798583984, "eval_loss": 0.009535559453070164, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.6678271293640137, "eval_rewards/margins": 17.024843215942383, "eval_rewards/rejected": -15.357016563415527, "eval_runtime": 61.9434, "eval_samples_per_second": 46.203, "eval_steps_per_second": 2.89, "step": 900 }, { "epoch": 0.42, "learning_rate": 2.872189349112426e-07, "logits/chosen": -1.138933539390564, "logits/rejected": -1.0921571254730225, "logps/chosen": -89.17816162109375, "logps/rejected": -98.90169525146484, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.6821825504302979, "rewards/margins": 17.2696533203125, "rewards/rejected": -15.587472915649414, "step": 910 }, { "epoch": 0.42, "learning_rate": 2.8671174978867285e-07, "logits/chosen": -1.132676362991333, "logits/rejected": -1.0800034999847412, "logps/chosen": -86.4345932006836, "logps/rejected": -97.37138366699219, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 2.264104127883911, "rewards/margins": 17.31032371520996, "rewards/rejected": -15.046220779418945, "step": 920 }, { "epoch": 0.42, "learning_rate": 2.862045646661031e-07, "logits/chosen": -1.1458117961883545, "logits/rejected": -1.0923669338226318, "logps/chosen": -92.89430236816406, "logps/rejected": -95.99363708496094, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 2.16265869140625, "rewards/margins": 18.03095054626465, "rewards/rejected": -15.868295669555664, "step": 930 }, { "epoch": 0.43, "learning_rate": 2.8569737954353336e-07, "logits/chosen": -1.1349055767059326, "logits/rejected": -1.083634614944458, "logps/chosen": -93.05912780761719, "logps/rejected": -103.2133560180664, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 1.1463124752044678, "rewards/margins": 17.069040298461914, "rewards/rejected": -15.922727584838867, "step": 940 }, { "epoch": 0.43, "learning_rate": 2.8519019442096367e-07, "logits/chosen": -1.14430832862854, "logits/rejected": -1.103515386581421, "logps/chosen": -86.96769714355469, "logps/rejected": -103.5058364868164, "loss": 0.0103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6363202333450317, "rewards/margins": 18.16317367553711, "rewards/rejected": -17.52685546875, "step": 950 }, { "epoch": 0.44, "learning_rate": 2.846830092983939e-07, "logits/chosen": -1.1029218435287476, "logits/rejected": -1.0647412538528442, "logps/chosen": -89.83350372314453, "logps/rejected": -97.29312133789062, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.2907755374908447, "rewards/margins": 16.094762802124023, "rewards/rejected": -14.803988456726074, "step": 960 }, { "epoch": 0.44, "learning_rate": 2.841758241758242e-07, "logits/chosen": -1.1452381610870361, "logits/rejected": -1.1065289974212646, "logps/chosen": -86.73941802978516, "logps/rejected": -92.15290069580078, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 1.6819941997528076, "rewards/margins": 15.786001205444336, "rewards/rejected": -14.10400676727295, "step": 970 }, { "epoch": 0.45, "learning_rate": 2.8366863905325443e-07, "logits/chosen": -1.1149189472198486, "logits/rejected": -1.0778940916061401, "logps/chosen": -87.29264068603516, "logps/rejected": -98.55472564697266, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.247238278388977, "rewards/margins": 16.33363914489746, "rewards/rejected": -15.086400032043457, "step": 980 }, { "epoch": 0.45, "learning_rate": 2.831614539306847e-07, "logits/chosen": -1.104259967803955, "logits/rejected": -1.0659714937210083, "logps/chosen": -83.01313018798828, "logps/rejected": -93.28031921386719, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3180642127990723, "rewards/margins": 17.55698585510254, "rewards/rejected": -15.238920211791992, "step": 990 }, { "epoch": 0.46, "learning_rate": 2.8265426880811494e-07, "logits/chosen": -1.1296952962875366, "logits/rejected": -1.080012559890747, "logps/chosen": -88.91751098632812, "logps/rejected": -97.22592163085938, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.736224889755249, "rewards/margins": 17.38802719116211, "rewards/rejected": -15.651802062988281, "step": 1000 }, { "epoch": 0.46, "eval_logits/chosen": -1.1499282121658325, "eval_logits/rejected": -1.1065919399261475, "eval_logps/chosen": -85.68370056152344, "eval_logps/rejected": -95.3034439086914, "eval_loss": 0.008165295235812664, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.8743646144866943, "eval_rewards/margins": 16.85523223876953, "eval_rewards/rejected": -14.980865478515625, "eval_runtime": 64.5918, "eval_samples_per_second": 44.309, "eval_steps_per_second": 2.771, "step": 1000 }, { "epoch": 0.46, "learning_rate": 2.821470836855452e-07, "logits/chosen": -1.105207085609436, "logits/rejected": -1.0650384426116943, "logps/chosen": -86.5456314086914, "logps/rejected": -99.33902740478516, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.171086072921753, "rewards/margins": 16.355039596557617, "rewards/rejected": -15.183954238891602, "step": 1010 }, { "epoch": 0.47, "learning_rate": 2.8163989856297545e-07, "logits/chosen": -1.093670129776001, "logits/rejected": -1.0636560916900635, "logps/chosen": -84.88511657714844, "logps/rejected": -100.3253173828125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.8782209157943726, "rewards/margins": 17.67445945739746, "rewards/rejected": -15.796239852905273, "step": 1020 }, { "epoch": 0.47, "learning_rate": 2.8113271344040575e-07, "logits/chosen": -1.1384307146072388, "logits/rejected": -1.0990992784500122, "logps/chosen": -82.6616439819336, "logps/rejected": -98.50576782226562, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3514288663864136, "rewards/margins": 17.497568130493164, "rewards/rejected": -16.14613914489746, "step": 1030 }, { "epoch": 0.47, "learning_rate": 2.80625528317836e-07, "logits/chosen": -1.099442481994629, "logits/rejected": -1.072345495223999, "logps/chosen": -94.7537612915039, "logps/rejected": -99.29774475097656, "loss": 0.0079, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.425427198410034, "rewards/margins": 17.845937728881836, "rewards/rejected": -15.420511245727539, "step": 1040 }, { "epoch": 0.48, "learning_rate": 2.8011834319526626e-07, "logits/chosen": -1.1023457050323486, "logits/rejected": -1.0619539022445679, "logps/chosen": -89.6052474975586, "logps/rejected": -101.50379943847656, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.8860256671905518, "rewards/margins": 18.8059024810791, "rewards/rejected": -16.919872283935547, "step": 1050 }, { "epoch": 0.48, "learning_rate": 2.796111580726965e-07, "logits/chosen": -1.1182763576507568, "logits/rejected": -1.083601474761963, "logps/chosen": -86.69522857666016, "logps/rejected": -98.14034271240234, "loss": 0.0078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7847849130630493, "rewards/margins": 16.757022857666016, "rewards/rejected": -15.972234725952148, "step": 1060 }, { "epoch": 0.49, "learning_rate": 2.7910397295012677e-07, "logits/chosen": -1.0921047925949097, "logits/rejected": -1.058253526687622, "logps/chosen": -88.3211898803711, "logps/rejected": -100.35057067871094, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.095982074737549, "rewards/margins": 18.4968318939209, "rewards/rejected": -16.40085220336914, "step": 1070 }, { "epoch": 0.49, "learning_rate": 2.78596787827557e-07, "logits/chosen": -1.1086832284927368, "logits/rejected": -1.074639916419983, "logps/chosen": -88.3951416015625, "logps/rejected": -101.92322540283203, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 2.5710289478302, "rewards/margins": 18.696849822998047, "rewards/rejected": -16.12582015991211, "step": 1080 }, { "epoch": 0.5, "learning_rate": 2.7808960270498733e-07, "logits/chosen": -1.1255228519439697, "logits/rejected": -1.0840017795562744, "logps/chosen": -84.0284194946289, "logps/rejected": -103.49464416503906, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.7648484706878662, "rewards/margins": 18.157766342163086, "rewards/rejected": -16.39291763305664, "step": 1090 }, { "epoch": 0.5, "learning_rate": 2.775824175824176e-07, "logits/chosen": -1.1340689659118652, "logits/rejected": -1.095685362815857, "logps/chosen": -86.91542053222656, "logps/rejected": -100.10322570800781, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1016123294830322, "rewards/margins": 17.42715835571289, "rewards/rejected": -16.325546264648438, "step": 1100 }, { "epoch": 0.5, "eval_logits/chosen": -1.1586607694625854, "eval_logits/rejected": -1.1157867908477783, "eval_logps/chosen": -85.93265533447266, "eval_logps/rejected": -96.73734283447266, "eval_loss": 0.007867630571126938, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.7498830556869507, "eval_rewards/margins": 17.447694778442383, "eval_rewards/rejected": -15.6978120803833, "eval_runtime": 64.3858, "eval_samples_per_second": 44.451, "eval_steps_per_second": 2.78, "step": 1100 }, { "epoch": 0.51, "learning_rate": 2.7707523245984784e-07, "logits/chosen": -1.1197665929794312, "logits/rejected": -1.078808307647705, "logps/chosen": -92.86619567871094, "logps/rejected": -102.52835845947266, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6940488815307617, "rewards/margins": 17.81356430053711, "rewards/rejected": -16.11951446533203, "step": 1110 }, { "epoch": 0.51, "learning_rate": 2.765680473372781e-07, "logits/chosen": -1.1259820461273193, "logits/rejected": -1.0869123935699463, "logps/chosen": -93.55479431152344, "logps/rejected": -95.98357391357422, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1045427322387695, "rewards/margins": 17.43987464904785, "rewards/rejected": -15.335331916809082, "step": 1120 }, { "epoch": 0.52, "learning_rate": 2.7606086221470835e-07, "logits/chosen": -1.137892246246338, "logits/rejected": -1.0994970798492432, "logps/chosen": -90.780517578125, "logps/rejected": -102.3265380859375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.290585517883301, "rewards/margins": 18.35919761657715, "rewards/rejected": -16.068613052368164, "step": 1130 }, { "epoch": 0.52, "learning_rate": 2.755536770921386e-07, "logits/chosen": -1.1444514989852905, "logits/rejected": -1.1056411266326904, "logps/chosen": -86.2841796875, "logps/rejected": -100.90594482421875, "loss": 0.0068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.109790325164795, "rewards/margins": 17.33005142211914, "rewards/rejected": -16.220258712768555, "step": 1140 }, { "epoch": 0.52, "learning_rate": 2.750464919695689e-07, "logits/chosen": -1.1471726894378662, "logits/rejected": -1.1120365858078003, "logps/chosen": -83.0753402709961, "logps/rejected": -98.07260131835938, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.3137190341949463, "rewards/margins": 17.202226638793945, "rewards/rejected": -15.888509750366211, "step": 1150 }, { "epoch": 0.53, "learning_rate": 2.745393068469991e-07, "logits/chosen": -1.131012201309204, "logits/rejected": -1.0842430591583252, "logps/chosen": -97.24982452392578, "logps/rejected": -103.58902740478516, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.803784728050232, "rewards/margins": 19.04859161376953, "rewards/rejected": -17.244808197021484, "step": 1160 }, { "epoch": 0.53, "learning_rate": 2.740321217244294e-07, "logits/chosen": -1.1448577642440796, "logits/rejected": -1.0983836650848389, "logps/chosen": -88.2474136352539, "logps/rejected": -101.29617309570312, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.321990489959717, "rewards/margins": 19.179903030395508, "rewards/rejected": -16.857913970947266, "step": 1170 }, { "epoch": 0.54, "learning_rate": 2.7352493660185967e-07, "logits/chosen": -1.15381920337677, "logits/rejected": -1.1216586828231812, "logps/chosen": -79.86784362792969, "logps/rejected": -101.373046875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.6416583061218262, "rewards/margins": 18.733495712280273, "rewards/rejected": -18.091835021972656, "step": 1180 }, { "epoch": 0.54, "learning_rate": 2.7301775147928993e-07, "logits/chosen": -1.1581026315689087, "logits/rejected": -1.1127700805664062, "logps/chosen": -95.01976013183594, "logps/rejected": -102.31490325927734, "loss": 0.0073, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2217804193496704, "rewards/margins": 18.29319953918457, "rewards/rejected": -17.07141876220703, "step": 1190 }, { "epoch": 0.55, "learning_rate": 2.725105663567202e-07, "logits/chosen": -1.1556333303451538, "logits/rejected": -1.1143467426300049, "logps/chosen": -85.31793975830078, "logps/rejected": -104.33609771728516, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 2.109412670135498, "rewards/margins": 19.068283081054688, "rewards/rejected": -16.958871841430664, "step": 1200 }, { "epoch": 0.55, "eval_logits/chosen": -1.2309268712997437, "eval_logits/rejected": -1.177735447883606, "eval_logps/chosen": -86.51016998291016, "eval_logps/rejected": -97.42555236816406, "eval_loss": 0.008077413775026798, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.4611291885375977, "eval_rewards/margins": 17.5030517578125, "eval_rewards/rejected": -16.041919708251953, "eval_runtime": 61.5469, "eval_samples_per_second": 46.501, "eval_steps_per_second": 2.908, "step": 1200 }, { "epoch": 0.55, "learning_rate": 2.7200338123415044e-07, "logits/chosen": -1.1782100200653076, "logits/rejected": -1.1402640342712402, "logps/chosen": -92.4530029296875, "logps/rejected": -104.04048919677734, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.6389929056167603, "rewards/margins": 18.719791412353516, "rewards/rejected": -17.080801010131836, "step": 1210 }, { "epoch": 0.56, "learning_rate": 2.714961961115807e-07, "logits/chosen": -1.1656419038772583, "logits/rejected": -1.1300376653671265, "logps/chosen": -83.96723175048828, "logps/rejected": -97.54676055908203, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 1.120603084564209, "rewards/margins": 17.0549259185791, "rewards/rejected": -15.934321403503418, "step": 1220 }, { "epoch": 0.56, "learning_rate": 2.70989010989011e-07, "logits/chosen": -1.181616187095642, "logits/rejected": -1.140226125717163, "logps/chosen": -86.73959350585938, "logps/rejected": -100.60263061523438, "loss": 0.0033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8291637897491455, "rewards/margins": 17.071361541748047, "rewards/rejected": -15.242197036743164, "step": 1230 }, { "epoch": 0.57, "learning_rate": 2.7048182586644125e-07, "logits/chosen": -1.1932631731033325, "logits/rejected": -1.1404846906661987, "logps/chosen": -89.65509796142578, "logps/rejected": -100.72111511230469, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 2.12876558303833, "rewards/margins": 18.558353424072266, "rewards/rejected": -16.429588317871094, "step": 1240 }, { "epoch": 0.57, "learning_rate": 2.699746407438715e-07, "logits/chosen": -1.1967064142227173, "logits/rejected": -1.1478387117385864, "logps/chosen": -86.44863891601562, "logps/rejected": -100.36338806152344, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.105329990386963, "rewards/margins": 18.038101196289062, "rewards/rejected": -16.932769775390625, "step": 1250 }, { "epoch": 0.58, "learning_rate": 2.6946745562130176e-07, "logits/chosen": -1.1940025091171265, "logits/rejected": -1.1532320976257324, "logps/chosen": -89.0478744506836, "logps/rejected": -102.28709411621094, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.9386533498764038, "rewards/margins": 18.23946762084961, "rewards/rejected": -16.30081558227539, "step": 1260 }, { "epoch": 0.58, "learning_rate": 2.68960270498732e-07, "logits/chosen": -1.1994531154632568, "logits/rejected": -1.1430509090423584, "logps/chosen": -89.60600280761719, "logps/rejected": -107.12632751464844, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8430372476577759, "rewards/margins": 19.31827163696289, "rewards/rejected": -18.475234985351562, "step": 1270 }, { "epoch": 0.58, "learning_rate": 2.6845308537616227e-07, "logits/chosen": -1.1602545976638794, "logits/rejected": -1.1090123653411865, "logps/chosen": -87.08576202392578, "logps/rejected": -95.82630920410156, "loss": 0.011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.725023627281189, "rewards/margins": 18.602710723876953, "rewards/rejected": -16.877687454223633, "step": 1280 }, { "epoch": 0.59, "learning_rate": 2.679459002535926e-07, "logits/chosen": -1.2227842807769775, "logits/rejected": -1.1634962558746338, "logps/chosen": -88.16740417480469, "logps/rejected": -97.47923278808594, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.8518360257148743, "rewards/margins": 17.11991310119629, "rewards/rejected": -16.268077850341797, "step": 1290 }, { "epoch": 0.59, "learning_rate": 2.674387151310228e-07, "logits/chosen": -1.1858384609222412, "logits/rejected": -1.1380422115325928, "logps/chosen": -91.48844909667969, "logps/rejected": -104.26606750488281, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 2.4121227264404297, "rewards/margins": 19.12758445739746, "rewards/rejected": -16.71546173095703, "step": 1300 }, { "epoch": 0.59, "eval_logits/chosen": -1.2335172891616821, "eval_logits/rejected": -1.1754339933395386, "eval_logps/chosen": -86.43448638916016, "eval_logps/rejected": -99.26506042480469, "eval_loss": 0.00815549585968256, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.4989758729934692, "eval_rewards/margins": 18.460649490356445, "eval_rewards/rejected": -16.961671829223633, "eval_runtime": 59.4219, "eval_samples_per_second": 48.164, "eval_steps_per_second": 3.012, "step": 1300 }, { "epoch": 0.6, "learning_rate": 2.669315300084531e-07, "logits/chosen": -1.2054587602615356, "logits/rejected": -1.1616264581680298, "logps/chosen": -86.14714050292969, "logps/rejected": -103.92906188964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.1422611474990845, "rewards/margins": 18.06179428100586, "rewards/rejected": -16.919536590576172, "step": 1310 }, { "epoch": 0.6, "learning_rate": 2.6642434488588334e-07, "logits/chosen": -1.164485216140747, "logits/rejected": -1.1202260255813599, "logps/chosen": -83.26927947998047, "logps/rejected": -103.29952239990234, "loss": 0.0101, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3391577005386353, "rewards/margins": 19.02712631225586, "rewards/rejected": -17.68796730041504, "step": 1320 }, { "epoch": 0.61, "learning_rate": 2.659171597633136e-07, "logits/chosen": -1.1936196088790894, "logits/rejected": -1.1440293788909912, "logps/chosen": -84.86544036865234, "logps/rejected": -104.18849182128906, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.3678613901138306, "rewards/margins": 19.402454376220703, "rewards/rejected": -18.03459358215332, "step": 1330 }, { "epoch": 0.61, "learning_rate": 2.6540997464074385e-07, "logits/chosen": -1.1995398998260498, "logits/rejected": -1.1528923511505127, "logps/chosen": -83.30887603759766, "logps/rejected": -100.70631408691406, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.5668017864227295, "rewards/margins": 20.389301300048828, "rewards/rejected": -17.822498321533203, "step": 1340 }, { "epoch": 0.62, "learning_rate": 2.649027895181741e-07, "logits/chosen": -1.2100002765655518, "logits/rejected": -1.1548488140106201, "logps/chosen": -82.71726989746094, "logps/rejected": -100.33686828613281, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.1490261554718018, "rewards/margins": 18.165735244750977, "rewards/rejected": -17.016712188720703, "step": 1350 }, { "epoch": 0.62, "learning_rate": 2.6439560439560436e-07, "logits/chosen": -1.2042474746704102, "logits/rejected": -1.1507916450500488, "logps/chosen": -89.17523956298828, "logps/rejected": -103.0042724609375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.0882587432861328, "rewards/margins": 19.461795806884766, "rewards/rejected": -18.37353515625, "step": 1360 }, { "epoch": 0.63, "learning_rate": 2.6388841927303466e-07, "logits/chosen": -1.19635808467865, "logits/rejected": -1.1526683568954468, "logps/chosen": -84.29436492919922, "logps/rejected": -103.22540283203125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.6204719543457031, "rewards/margins": 19.50061798095703, "rewards/rejected": -17.880146026611328, "step": 1370 }, { "epoch": 0.63, "learning_rate": 2.633812341504649e-07, "logits/chosen": -1.1552878618240356, "logits/rejected": -1.1150346994400024, "logps/chosen": -84.24638366699219, "logps/rejected": -101.41566467285156, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.426559329032898, "rewards/margins": 18.867931365966797, "rewards/rejected": -17.44137191772461, "step": 1380 }, { "epoch": 0.63, "learning_rate": 2.6287404902789517e-07, "logits/chosen": -1.1951394081115723, "logits/rejected": -1.1533236503601074, "logps/chosen": -89.08964538574219, "logps/rejected": -107.95294189453125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.272940754890442, "rewards/margins": 19.47437858581543, "rewards/rejected": -18.201440811157227, "step": 1390 }, { "epoch": 0.64, "learning_rate": 2.623668639053254e-07, "logits/chosen": -1.1739484071731567, "logits/rejected": -1.1234138011932373, "logps/chosen": -93.20767211914062, "logps/rejected": -102.9843521118164, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.4396759271621704, "rewards/margins": 19.409513473510742, "rewards/rejected": -17.969837188720703, "step": 1400 }, { "epoch": 0.64, "eval_logits/chosen": -1.2195051908493042, "eval_logits/rejected": -1.1641638278961182, "eval_logps/chosen": -86.76068115234375, "eval_logps/rejected": -102.28946685791016, "eval_loss": 0.008062196895480156, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.3358712196350098, "eval_rewards/margins": 19.80974578857422, "eval_rewards/rejected": -18.473875045776367, "eval_runtime": 69.0484, "eval_samples_per_second": 41.449, "eval_steps_per_second": 2.592, "step": 1400 }, { "epoch": 0.64, "learning_rate": 2.618596787827557e-07, "logits/chosen": -1.157814621925354, "logits/rejected": -1.119381308555603, "logps/chosen": -92.300048828125, "logps/rejected": -103.40312194824219, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.9399839639663696, "rewards/margins": 19.74970054626465, "rewards/rejected": -17.80971908569336, "step": 1410 }, { "epoch": 0.65, "learning_rate": 2.6135249366018593e-07, "logits/chosen": -1.1667792797088623, "logits/rejected": -1.119706392288208, "logps/chosen": -85.43004608154297, "logps/rejected": -100.96659851074219, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.285947561264038, "rewards/margins": 19.657182693481445, "rewards/rejected": -17.371234893798828, "step": 1420 }, { "epoch": 0.65, "learning_rate": 2.6084530853761624e-07, "logits/chosen": -1.1735081672668457, "logits/rejected": -1.1159073114395142, "logps/chosen": -89.15231323242188, "logps/rejected": -104.07325744628906, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.379710853099823, "rewards/margins": 18.454669952392578, "rewards/rejected": -18.07495880126953, "step": 1430 }, { "epoch": 0.66, "learning_rate": 2.603381234150465e-07, "logits/chosen": -1.177208423614502, "logits/rejected": -1.1227295398712158, "logps/chosen": -87.95960235595703, "logps/rejected": -100.62553405761719, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.4888887405395508, "rewards/margins": 18.826932907104492, "rewards/rejected": -17.338045120239258, "step": 1440 }, { "epoch": 0.66, "learning_rate": 2.5983093829247675e-07, "logits/chosen": -1.135980248451233, "logits/rejected": -1.097318410873413, "logps/chosen": -90.05815887451172, "logps/rejected": -97.01565551757812, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 1.6748930215835571, "rewards/margins": 16.284954071044922, "rewards/rejected": -14.61005973815918, "step": 1450 }, { "epoch": 0.67, "learning_rate": 2.59323753169907e-07, "logits/chosen": -1.1009352207183838, "logits/rejected": -1.0712867975234985, "logps/chosen": -90.10000610351562, "logps/rejected": -104.41805267333984, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.7969961762428284, "rewards/margins": 18.065677642822266, "rewards/rejected": -17.268680572509766, "step": 1460 }, { "epoch": 0.67, "learning_rate": 2.5881656804733726e-07, "logits/chosen": -1.1424516439437866, "logits/rejected": -1.104478120803833, "logps/chosen": -85.87166595458984, "logps/rejected": -96.37669372558594, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.2775685787200928, "rewards/margins": 17.6835880279541, "rewards/rejected": -16.40601921081543, "step": 1470 }, { "epoch": 0.68, "learning_rate": 2.583093829247675e-07, "logits/chosen": -1.148421049118042, "logits/rejected": -1.1051350831985474, "logps/chosen": -84.38792419433594, "logps/rejected": -95.69989776611328, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.0020928382873535, "rewards/margins": 18.655029296875, "rewards/rejected": -16.652935028076172, "step": 1480 }, { "epoch": 0.68, "learning_rate": 2.5780219780219777e-07, "logits/chosen": -1.137450933456421, "logits/rejected": -1.1082595586776733, "logps/chosen": -88.18030548095703, "logps/rejected": -98.36376190185547, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 2.1630921363830566, "rewards/margins": 18.910762786865234, "rewards/rejected": -16.747669219970703, "step": 1490 }, { "epoch": 0.68, "learning_rate": 2.57295012679628e-07, "logits/chosen": -1.144364595413208, "logits/rejected": -1.1114692687988281, "logps/chosen": -87.55670928955078, "logps/rejected": -104.33062744140625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.494736909866333, "rewards/margins": 19.5846004486084, "rewards/rejected": -18.08986473083496, "step": 1500 }, { "epoch": 0.68, "eval_logits/chosen": -1.2191152572631836, "eval_logits/rejected": -1.1659319400787354, "eval_logps/chosen": -87.01031494140625, "eval_logps/rejected": -101.05850982666016, "eval_loss": 0.00720271747559309, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.211055040359497, "eval_rewards/margins": 19.069448471069336, "eval_rewards/rejected": -17.85839080810547, "eval_runtime": 77.7452, "eval_samples_per_second": 36.813, "eval_steps_per_second": 2.302, "step": 1500 }, { "epoch": 0.69, "learning_rate": 2.5678782755705833e-07, "logits/chosen": -1.1631323099136353, "logits/rejected": -1.1225086450576782, "logps/chosen": -82.05781555175781, "logps/rejected": -102.92134094238281, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.1484121084213257, "rewards/margins": 19.43674087524414, "rewards/rejected": -18.288328170776367, "step": 1510 }, { "epoch": 0.69, "learning_rate": 2.562806424344886e-07, "logits/chosen": -1.1722062826156616, "logits/rejected": -1.1262253522872925, "logps/chosen": -92.10721588134766, "logps/rejected": -107.66226959228516, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.5253825187683105, "rewards/margins": 21.867202758789062, "rewards/rejected": -19.341819763183594, "step": 1520 }, { "epoch": 0.7, "learning_rate": 2.5577345731191884e-07, "logits/chosen": -1.1755383014678955, "logits/rejected": -1.1288068294525146, "logps/chosen": -86.13807678222656, "logps/rejected": -107.95841217041016, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.1721524000167847, "rewards/margins": 20.330608367919922, "rewards/rejected": -19.158458709716797, "step": 1530 }, { "epoch": 0.7, "learning_rate": 2.552662721893491e-07, "logits/chosen": -1.1406983137130737, "logits/rejected": -1.1065037250518799, "logps/chosen": -88.02510070800781, "logps/rejected": -102.81829833984375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.6564058661460876, "rewards/margins": 18.354061126708984, "rewards/rejected": -17.697656631469727, "step": 1540 }, { "epoch": 0.71, "learning_rate": 2.5475908706677935e-07, "logits/chosen": -1.1616684198379517, "logits/rejected": -1.1152303218841553, "logps/chosen": -85.58765411376953, "logps/rejected": -105.52689361572266, "loss": 0.0038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9486013650894165, "rewards/margins": 20.196699142456055, "rewards/rejected": -19.24810028076172, "step": 1550 }, { "epoch": 0.71, "learning_rate": 2.542519019442096e-07, "logits/chosen": -1.2028621435165405, "logits/rejected": -1.1485238075256348, "logps/chosen": -78.52555847167969, "logps/rejected": -99.6640853881836, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5557008981704712, "rewards/margins": 19.518081665039062, "rewards/rejected": -17.962379455566406, "step": 1560 }, { "epoch": 0.72, "learning_rate": 2.537447168216399e-07, "logits/chosen": -1.176762342453003, "logits/rejected": -1.1274961233139038, "logps/chosen": -93.12080383300781, "logps/rejected": -105.16368103027344, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.659658432006836, "rewards/margins": 19.342235565185547, "rewards/rejected": -17.682575225830078, "step": 1570 }, { "epoch": 0.72, "learning_rate": 2.5323753169907016e-07, "logits/chosen": -1.2171311378479004, "logits/rejected": -1.172149419784546, "logps/chosen": -86.0450210571289, "logps/rejected": -107.85368347167969, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6793642044067383, "rewards/margins": 21.292896270751953, "rewards/rejected": -19.613529205322266, "step": 1580 }, { "epoch": 0.73, "learning_rate": 2.527303465765004e-07, "logits/chosen": -1.2056636810302734, "logits/rejected": -1.1607674360275269, "logps/chosen": -93.74681854248047, "logps/rejected": -113.48725891113281, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6828597187995911, "rewards/margins": 20.181041717529297, "rewards/rejected": -19.49818229675293, "step": 1590 }, { "epoch": 0.73, "learning_rate": 2.5222316145393067e-07, "logits/chosen": -1.1712194681167603, "logits/rejected": -1.124768853187561, "logps/chosen": -96.73299407958984, "logps/rejected": -108.51261901855469, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.9574599266052246, "rewards/margins": 20.438793182373047, "rewards/rejected": -19.481334686279297, "step": 1600 }, { "epoch": 0.73, "eval_logits/chosen": -1.2923858165740967, "eval_logits/rejected": -1.227616786956787, "eval_logps/chosen": -87.92854309082031, "eval_logps/rejected": -104.88460540771484, "eval_loss": 0.007888087071478367, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.7519445419311523, "eval_rewards/margins": 20.523387908935547, "eval_rewards/rejected": -19.77144432067871, "eval_runtime": 58.4414, "eval_samples_per_second": 48.972, "eval_steps_per_second": 3.063, "step": 1600 }, { "epoch": 0.73, "learning_rate": 2.517159763313609e-07, "logits/chosen": -1.2481939792633057, "logits/rejected": -1.1921546459197998, "logps/chosen": -90.30425262451172, "logps/rejected": -106.55949401855469, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.6353106498718262, "rewards/margins": 21.251611709594727, "rewards/rejected": -20.616300582885742, "step": 1610 }, { "epoch": 0.74, "learning_rate": 2.512087912087912e-07, "logits/chosen": -1.2304586172103882, "logits/rejected": -1.173517107963562, "logps/chosen": -92.87260437011719, "logps/rejected": -110.1197280883789, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.5774765610694885, "rewards/margins": 20.693300247192383, "rewards/rejected": -20.115821838378906, "step": 1620 }, { "epoch": 0.74, "learning_rate": 2.507016060862215e-07, "logits/chosen": -1.2224457263946533, "logits/rejected": -1.1627413034439087, "logps/chosen": -89.59088134765625, "logps/rejected": -107.96586608886719, "loss": 0.0049, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6187242865562439, "rewards/margins": 21.216934204101562, "rewards/rejected": -20.598209381103516, "step": 1630 }, { "epoch": 0.75, "learning_rate": 2.501944209636517e-07, "logits/chosen": -1.2341878414154053, "logits/rejected": -1.1646558046340942, "logps/chosen": -99.49878692626953, "logps/rejected": -107.7359390258789, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.47276169061660767, "rewards/margins": 21.252347946166992, "rewards/rejected": -20.779584884643555, "step": 1640 }, { "epoch": 0.75, "learning_rate": 2.49687235841082e-07, "logits/chosen": -1.2416927814483643, "logits/rejected": -1.1858526468276978, "logps/chosen": -92.1099624633789, "logps/rejected": -109.5634765625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.229796051979065, "rewards/margins": 20.57327651977539, "rewards/rejected": -19.34347915649414, "step": 1650 }, { "epoch": 0.76, "learning_rate": 2.4918005071851225e-07, "logits/chosen": -1.2330695390701294, "logits/rejected": -1.1908389329910278, "logps/chosen": -87.75404357910156, "logps/rejected": -106.69844055175781, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.9370654821395874, "rewards/margins": 20.381633758544922, "rewards/rejected": -19.444568634033203, "step": 1660 }, { "epoch": 0.76, "learning_rate": 2.486728655959425e-07, "logits/chosen": -1.2374944686889648, "logits/rejected": -1.178486943244934, "logps/chosen": -88.52249145507812, "logps/rejected": -102.99385070800781, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8609209060668945, "rewards/margins": 20.464916229248047, "rewards/rejected": -19.6039981842041, "step": 1670 }, { "epoch": 0.77, "learning_rate": 2.481656804733728e-07, "logits/chosen": -1.228576421737671, "logits/rejected": -1.1720635890960693, "logps/chosen": -89.48531341552734, "logps/rejected": -108.29301452636719, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.3690917491912842, "rewards/margins": 20.013029098510742, "rewards/rejected": -18.643938064575195, "step": 1680 }, { "epoch": 0.77, "learning_rate": 2.47658495350803e-07, "logits/chosen": -1.2124004364013672, "logits/rejected": -1.1609828472137451, "logps/chosen": -84.43907928466797, "logps/rejected": -104.23795318603516, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.7291128635406494, "rewards/margins": 20.71861457824707, "rewards/rejected": -18.989501953125, "step": 1690 }, { "epoch": 0.78, "learning_rate": 2.4715131022823327e-07, "logits/chosen": -1.2294721603393555, "logits/rejected": -1.1744955778121948, "logps/chosen": -92.34876251220703, "logps/rejected": -111.11873626708984, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.766455888748169, "rewards/margins": 21.175350189208984, "rewards/rejected": -20.408893585205078, "step": 1700 }, { "epoch": 0.78, "eval_logits/chosen": -1.2879317998886108, "eval_logits/rejected": -1.226698875427246, "eval_logps/chosen": -87.46847534179688, "eval_logps/rejected": -104.55460357666016, "eval_loss": 0.007508194539695978, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9819736480712891, "eval_rewards/margins": 20.588415145874023, "eval_rewards/rejected": -19.606443405151367, "eval_runtime": 66.3677, "eval_samples_per_second": 43.123, "eval_steps_per_second": 2.697, "step": 1700 }, { "epoch": 0.78, "learning_rate": 2.4664412510566357e-07, "logits/chosen": -1.2202476263046265, "logits/rejected": -1.158785104751587, "logps/chosen": -89.9117202758789, "logps/rejected": -105.97224426269531, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9460952877998352, "rewards/margins": 20.959388732910156, "rewards/rejected": -20.013294219970703, "step": 1710 }, { "epoch": 0.79, "learning_rate": 2.4613693998309383e-07, "logits/chosen": -1.2056442499160767, "logits/rejected": -1.1586743593215942, "logps/chosen": -84.9280776977539, "logps/rejected": -102.05850982666016, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1483945846557617, "rewards/margins": 20.313642501831055, "rewards/rejected": -18.16524887084961, "step": 1720 }, { "epoch": 0.79, "learning_rate": 2.456297548605241e-07, "logits/chosen": -1.1853172779083252, "logits/rejected": -1.146316409111023, "logps/chosen": -85.19999694824219, "logps/rejected": -104.33415222167969, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.6200752258300781, "rewards/margins": 19.86764907836914, "rewards/rejected": -18.247573852539062, "step": 1730 }, { "epoch": 0.79, "learning_rate": 2.4512256973795434e-07, "logits/chosen": -1.1810106039047241, "logits/rejected": -1.1348565816879272, "logps/chosen": -88.91767120361328, "logps/rejected": -105.30973815917969, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.908353567123413, "rewards/margins": 20.9567928314209, "rewards/rejected": -19.04844093322754, "step": 1740 }, { "epoch": 0.8, "learning_rate": 2.446153846153846e-07, "logits/chosen": -1.1730239391326904, "logits/rejected": -1.1376618146896362, "logps/chosen": -87.39840698242188, "logps/rejected": -104.39476013183594, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5575692653656006, "rewards/margins": 19.33660125732422, "rewards/rejected": -17.779033660888672, "step": 1750 }, { "epoch": 0.8, "learning_rate": 2.4410819949281484e-07, "logits/chosen": -1.1946265697479248, "logits/rejected": -1.1434904336929321, "logps/chosen": -89.12718963623047, "logps/rejected": -110.394287109375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.4408177435398102, "rewards/margins": 19.224130630493164, "rewards/rejected": -18.78331184387207, "step": 1760 }, { "epoch": 0.81, "learning_rate": 2.4360101437024515e-07, "logits/chosen": -1.1789695024490356, "logits/rejected": -1.136823296546936, "logps/chosen": -83.54109954833984, "logps/rejected": -104.07505798339844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.6552903652191162, "rewards/margins": 20.878345489501953, "rewards/rejected": -19.223054885864258, "step": 1770 }, { "epoch": 0.81, "learning_rate": 2.4309382924767535e-07, "logits/chosen": -1.1888688802719116, "logits/rejected": -1.1330959796905518, "logps/chosen": -91.33379364013672, "logps/rejected": -110.06937408447266, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.4772006869316101, "rewards/margins": 20.467754364013672, "rewards/rejected": -19.990554809570312, "step": 1780 }, { "epoch": 0.82, "learning_rate": 2.4258664412510566e-07, "logits/chosen": -1.1968967914581299, "logits/rejected": -1.1466515064239502, "logps/chosen": -90.5532455444336, "logps/rejected": -102.20117950439453, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0483832359313965, "rewards/margins": 20.55087661743164, "rewards/rejected": -18.502490997314453, "step": 1790 }, { "epoch": 0.82, "learning_rate": 2.420794590025359e-07, "logits/chosen": -1.1845229864120483, "logits/rejected": -1.137880802154541, "logps/chosen": -90.67330932617188, "logps/rejected": -107.00489807128906, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.5007214546203613, "rewards/margins": 20.610965728759766, "rewards/rejected": -19.110244750976562, "step": 1800 }, { "epoch": 0.82, "eval_logits/chosen": -1.2606803178787231, "eval_logits/rejected": -1.1984455585479736, "eval_logps/chosen": -86.78810119628906, "eval_logps/rejected": -103.85710906982422, "eval_loss": 0.008221164345741272, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.322161316871643, "eval_rewards/margins": 20.57985496520996, "eval_rewards/rejected": -19.257694244384766, "eval_runtime": 70.2981, "eval_samples_per_second": 40.712, "eval_steps_per_second": 2.546, "step": 1800 }, { "epoch": 0.83, "learning_rate": 2.4157227387996617e-07, "logits/chosen": -1.2245190143585205, "logits/rejected": -1.1681041717529297, "logps/chosen": -90.532470703125, "logps/rejected": -104.7442398071289, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.7430979609489441, "rewards/margins": 20.217361450195312, "rewards/rejected": -19.47426414489746, "step": 1810 }, { "epoch": 0.83, "learning_rate": 2.410650887573965e-07, "logits/chosen": -1.2046802043914795, "logits/rejected": -1.153531789779663, "logps/chosen": -89.37370300292969, "logps/rejected": -104.24407958984375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.3622597455978394, "rewards/margins": 19.505048751831055, "rewards/rejected": -18.14278793334961, "step": 1820 }, { "epoch": 0.84, "learning_rate": 2.405579036348267e-07, "logits/chosen": -1.1867458820343018, "logits/rejected": -1.1448475122451782, "logps/chosen": -80.08723449707031, "logps/rejected": -102.7812728881836, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.2673819065093994, "rewards/margins": 19.793813705444336, "rewards/rejected": -18.526432037353516, "step": 1830 }, { "epoch": 0.84, "learning_rate": 2.4005071851225693e-07, "logits/chosen": -1.1724661588668823, "logits/rejected": -1.1277903318405151, "logps/chosen": -90.30950927734375, "logps/rejected": -113.6275405883789, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.5377770662307739, "rewards/margins": 20.727577209472656, "rewards/rejected": -20.189800262451172, "step": 1840 }, { "epoch": 0.84, "learning_rate": 2.3954353338968724e-07, "logits/chosen": -1.1979376077651978, "logits/rejected": -1.155174970626831, "logps/chosen": -83.19313049316406, "logps/rejected": -104.564453125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587980508804321, "rewards/margins": 20.022220611572266, "rewards/rejected": -19.063425064086914, "step": 1850 }, { "epoch": 0.85, "learning_rate": 2.390363482671175e-07, "logits/chosen": -1.170663595199585, "logits/rejected": -1.11752450466156, "logps/chosen": -90.89318084716797, "logps/rejected": -109.8968734741211, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.412500262260437, "rewards/margins": 21.69009017944336, "rewards/rejected": -20.277587890625, "step": 1860 }, { "epoch": 0.85, "learning_rate": 2.3852916314454775e-07, "logits/chosen": -1.1926937103271484, "logits/rejected": -1.1473562717437744, "logps/chosen": -93.6591796875, "logps/rejected": -106.84416198730469, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.8632802963256836, "rewards/margins": 19.899490356445312, "rewards/rejected": -18.036211013793945, "step": 1870 }, { "epoch": 0.86, "learning_rate": 2.38021978021978e-07, "logits/chosen": -1.1921271085739136, "logits/rejected": -1.1413754224777222, "logps/chosen": -85.18832397460938, "logps/rejected": -105.3952407836914, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.8996086120605469, "rewards/margins": 20.485759735107422, "rewards/rejected": -19.586151123046875, "step": 1880 }, { "epoch": 0.86, "learning_rate": 2.3751479289940826e-07, "logits/chosen": -1.2049638032913208, "logits/rejected": -1.1474329233169556, "logps/chosen": -92.87411499023438, "logps/rejected": -106.48674011230469, "loss": 0.0039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1706862449645996, "rewards/margins": 22.162261962890625, "rewards/rejected": -19.991575241088867, "step": 1890 }, { "epoch": 0.87, "learning_rate": 2.3700760777683854e-07, "logits/chosen": -1.1745494604110718, "logits/rejected": -1.1166932582855225, "logps/chosen": -90.28245544433594, "logps/rejected": -111.68940734863281, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.081857204437256, "rewards/margins": 21.467239379882812, "rewards/rejected": -19.385379791259766, "step": 1900 }, { "epoch": 0.87, "eval_logits/chosen": -1.2352814674377441, "eval_logits/rejected": -1.1811531782150269, "eval_logps/chosen": -86.63172149658203, "eval_logps/rejected": -104.2937240600586, "eval_loss": 0.007481275591999292, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.4003478288650513, "eval_rewards/margins": 20.87635612487793, "eval_rewards/rejected": -19.47600746154785, "eval_runtime": 64.2566, "eval_samples_per_second": 44.54, "eval_steps_per_second": 2.786, "step": 1900 }, { "epoch": 0.87, "learning_rate": 2.3650042265426882e-07, "logits/chosen": -1.2035939693450928, "logits/rejected": -1.1619645357131958, "logps/chosen": -87.07538604736328, "logps/rejected": -105.97283935546875, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9201610088348389, "rewards/margins": 21.46194839477539, "rewards/rejected": -19.54178810119629, "step": 1910 }, { "epoch": 0.88, "learning_rate": 2.3599323753169907e-07, "logits/chosen": -1.1762607097625732, "logits/rejected": -1.1266227960586548, "logps/chosen": -84.24024200439453, "logps/rejected": -105.28382873535156, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7776741981506348, "rewards/margins": 21.163555145263672, "rewards/rejected": -20.38587760925293, "step": 1920 }, { "epoch": 0.88, "learning_rate": 2.354860524091293e-07, "logits/chosen": -1.196942925453186, "logits/rejected": -1.1432682275772095, "logps/chosen": -95.1038818359375, "logps/rejected": -106.1786117553711, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.3159481287002563, "rewards/margins": 21.558012008666992, "rewards/rejected": -20.2420654296875, "step": 1930 }, { "epoch": 0.89, "learning_rate": 2.3497886728655958e-07, "logits/chosen": -1.1665886640548706, "logits/rejected": -1.1258021593093872, "logps/chosen": -83.95616149902344, "logps/rejected": -100.2303237915039, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.901753306388855, "rewards/margins": 19.35708999633789, "rewards/rejected": -18.455339431762695, "step": 1940 }, { "epoch": 0.89, "learning_rate": 2.3447168216398983e-07, "logits/chosen": -1.18831205368042, "logits/rejected": -1.1461542844772339, "logps/chosen": -89.49617767333984, "logps/rejected": -105.2798843383789, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 1.2909132242202759, "rewards/margins": 20.118661880493164, "rewards/rejected": -18.827747344970703, "step": 1950 }, { "epoch": 0.89, "learning_rate": 2.3396449704142012e-07, "logits/chosen": -1.1922476291656494, "logits/rejected": -1.1407110691070557, "logps/chosen": -94.23783874511719, "logps/rejected": -105.92415618896484, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5083615183830261, "rewards/margins": 19.7877254486084, "rewards/rejected": -19.279361724853516, "step": 1960 }, { "epoch": 0.9, "learning_rate": 2.334573119188504e-07, "logits/chosen": -1.1829791069030762, "logits/rejected": -1.1273462772369385, "logps/chosen": -88.82598876953125, "logps/rejected": -101.88629150390625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.235897183418274, "rewards/margins": 21.914575576782227, "rewards/rejected": -20.67867660522461, "step": 1970 }, { "epoch": 0.9, "learning_rate": 2.3295012679628062e-07, "logits/chosen": -1.2115113735198975, "logits/rejected": -1.1657658815383911, "logps/chosen": -86.12489318847656, "logps/rejected": -107.81489562988281, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 0.653774082660675, "rewards/margins": 20.462425231933594, "rewards/rejected": -19.808650970458984, "step": 1980 }, { "epoch": 0.91, "learning_rate": 2.3244294167371088e-07, "logits/chosen": -1.2142908573150635, "logits/rejected": -1.1777855157852173, "logps/chosen": -84.75012969970703, "logps/rejected": -106.76557922363281, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.0658010244369507, "rewards/margins": 20.134159088134766, "rewards/rejected": -19.068355560302734, "step": 1990 }, { "epoch": 0.91, "learning_rate": 2.3193575655114116e-07, "logits/chosen": -1.2295953035354614, "logits/rejected": -1.174081802368164, "logps/chosen": -88.81068420410156, "logps/rejected": -106.81929779052734, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.5790882110595703, "rewards/margins": 20.7359619140625, "rewards/rejected": -19.156875610351562, "step": 2000 }, { "epoch": 0.91, "eval_logits/chosen": -1.2681105136871338, "eval_logits/rejected": -1.2061457633972168, "eval_logps/chosen": -86.66743469238281, "eval_logps/rejected": -104.74527740478516, "eval_loss": 0.007192350458353758, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.3824937343597412, "eval_rewards/margins": 21.084270477294922, "eval_rewards/rejected": -19.7017765045166, "eval_runtime": 77.6552, "eval_samples_per_second": 36.855, "eval_steps_per_second": 2.305, "step": 2000 }, { "epoch": 0.92, "learning_rate": 2.3142857142857144e-07, "logits/chosen": -1.194087266921997, "logits/rejected": -1.1440551280975342, "logps/chosen": -87.83335876464844, "logps/rejected": -106.5591812133789, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 1.5759541988372803, "rewards/margins": 20.1513614654541, "rewards/rejected": -18.575407028198242, "step": 2010 }, { "epoch": 0.92, "learning_rate": 2.3092138630600167e-07, "logits/chosen": -1.1671911478042603, "logits/rejected": -1.1248931884765625, "logps/chosen": -82.89833068847656, "logps/rejected": -110.85752868652344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.971392035484314, "rewards/margins": 20.262601852416992, "rewards/rejected": -19.291210174560547, "step": 2020 }, { "epoch": 0.93, "learning_rate": 2.3041420118343192e-07, "logits/chosen": -1.1789249181747437, "logits/rejected": -1.1308996677398682, "logps/chosen": -85.59063720703125, "logps/rejected": -98.32166290283203, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.9662765264511108, "rewards/margins": 20.44546890258789, "rewards/rejected": -18.479190826416016, "step": 2030 }, { "epoch": 0.93, "learning_rate": 2.299070160608622e-07, "logits/chosen": -1.1901233196258545, "logits/rejected": -1.1428929567337036, "logps/chosen": -83.4649887084961, "logps/rejected": -101.49374389648438, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.49424687027931213, "rewards/margins": 19.59086036682129, "rewards/rejected": -19.09661293029785, "step": 2040 }, { "epoch": 0.94, "learning_rate": 2.2939983093829248e-07, "logits/chosen": -1.211072325706482, "logits/rejected": -1.1505995988845825, "logps/chosen": -92.88377380371094, "logps/rejected": -105.16926574707031, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 1.585486650466919, "rewards/margins": 20.312013626098633, "rewards/rejected": -18.72652816772461, "step": 2050 }, { "epoch": 0.94, "learning_rate": 2.2889264581572274e-07, "logits/chosen": -1.1979122161865234, "logits/rejected": -1.1408374309539795, "logps/chosen": -88.95879364013672, "logps/rejected": -100.50065612792969, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2848825454711914, "rewards/margins": 20.595218658447266, "rewards/rejected": -18.31033706665039, "step": 2060 }, { "epoch": 0.94, "learning_rate": 2.2838546069315297e-07, "logits/chosen": -1.1775312423706055, "logits/rejected": -1.1326141357421875, "logps/chosen": -83.72702026367188, "logps/rejected": -104.96478271484375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.2204118967056274, "rewards/margins": 20.910999298095703, "rewards/rejected": -19.69058609008789, "step": 2070 }, { "epoch": 0.95, "learning_rate": 2.2787827557058325e-07, "logits/chosen": -1.1710788011550903, "logits/rejected": -1.1334991455078125, "logps/chosen": -84.7524185180664, "logps/rejected": -103.06710052490234, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.717494249343872, "rewards/margins": 20.584232330322266, "rewards/rejected": -18.866741180419922, "step": 2080 }, { "epoch": 0.95, "learning_rate": 2.273710904480135e-07, "logits/chosen": -1.1447703838348389, "logits/rejected": -1.1037436723709106, "logps/chosen": -88.0915298461914, "logps/rejected": -101.42835998535156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.3874223232269287, "rewards/margins": 19.40856170654297, "rewards/rejected": -18.02113914489746, "step": 2090 }, { "epoch": 0.96, "learning_rate": 2.2686390532544378e-07, "logits/chosen": -1.1998827457427979, "logits/rejected": -1.144376277923584, "logps/chosen": -87.5056381225586, "logps/rejected": -100.17271423339844, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.228088855743408, "rewards/margins": 20.235179901123047, "rewards/rejected": -18.007089614868164, "step": 2100 }, { "epoch": 0.96, "eval_logits/chosen": -1.2238779067993164, "eval_logits/rejected": -1.1694415807724, "eval_logps/chosen": -86.23553466796875, "eval_logps/rejected": -102.94633483886719, "eval_loss": 0.0070498245768249035, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.5984489917755127, "eval_rewards/margins": 20.400758743286133, "eval_rewards/rejected": -18.802310943603516, "eval_runtime": 62.0998, "eval_samples_per_second": 46.087, "eval_steps_per_second": 2.882, "step": 2100 }, { "epoch": 0.96, "learning_rate": 2.2635672020287406e-07, "logits/chosen": -1.1710069179534912, "logits/rejected": -1.1208176612854004, "logps/chosen": -89.8357162475586, "logps/rejected": -104.9393310546875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.2994686365127563, "rewards/margins": 20.46290397644043, "rewards/rejected": -19.163436889648438, "step": 2110 }, { "epoch": 0.97, "learning_rate": 2.258495350803043e-07, "logits/chosen": -1.167830228805542, "logits/rejected": -1.1247303485870361, "logps/chosen": -88.80085754394531, "logps/rejected": -101.0454330444336, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.7379745244979858, "rewards/margins": 20.091503143310547, "rewards/rejected": -18.353527069091797, "step": 2120 }, { "epoch": 0.97, "learning_rate": 2.2534234995773454e-07, "logits/chosen": -1.1950472593307495, "logits/rejected": -1.144486427307129, "logps/chosen": -91.32661437988281, "logps/rejected": -110.21795654296875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 0.46089300513267517, "rewards/margins": 21.63949966430664, "rewards/rejected": -21.178604125976562, "step": 2130 }, { "epoch": 0.98, "learning_rate": 2.2483516483516483e-07, "logits/chosen": -1.1894288063049316, "logits/rejected": -1.1427417993545532, "logps/chosen": -85.63072967529297, "logps/rejected": -102.2844467163086, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9994566440582275, "rewards/margins": 21.046451568603516, "rewards/rejected": -19.0469913482666, "step": 2140 }, { "epoch": 0.98, "learning_rate": 2.243279797125951e-07, "logits/chosen": -1.1913617849349976, "logits/rejected": -1.1351690292358398, "logps/chosen": -86.342529296875, "logps/rejected": -105.4500503540039, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.3321704864501953, "rewards/margins": 20.112749099731445, "rewards/rejected": -18.780582427978516, "step": 2150 }, { "epoch": 0.99, "learning_rate": 2.2382079459002536e-07, "logits/chosen": -1.200955867767334, "logits/rejected": -1.141643762588501, "logps/chosen": -93.14427185058594, "logps/rejected": -100.25897979736328, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7122005224227905, "rewards/margins": 20.42462158203125, "rewards/rejected": -18.712421417236328, "step": 2160 }, { "epoch": 0.99, "learning_rate": 2.233136094674556e-07, "logits/chosen": -1.2126085758209229, "logits/rejected": -1.1542718410491943, "logps/chosen": -94.84847259521484, "logps/rejected": -105.94117736816406, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.69661545753479, "rewards/margins": 20.98930549621582, "rewards/rejected": -19.29269027709961, "step": 2170 }, { "epoch": 0.99, "learning_rate": 2.2280642434488587e-07, "logits/chosen": -1.2024385929107666, "logits/rejected": -1.15294349193573, "logps/chosen": -81.07176208496094, "logps/rejected": -103.16294860839844, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 2.119349479675293, "rewards/margins": 21.580371856689453, "rewards/rejected": -19.461023330688477, "step": 2180 }, { "epoch": 1.0, "learning_rate": 2.2229923922231615e-07, "logits/chosen": -1.1685454845428467, "logits/rejected": -1.1278364658355713, "logps/chosen": -91.21316528320312, "logps/rejected": -108.0694580078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.229431629180908, "rewards/margins": 21.74981689453125, "rewards/rejected": -19.5203857421875, "step": 2190 }, { "epoch": 1.0, "learning_rate": 2.217920540997464e-07, "logits/chosen": -1.1836879253387451, "logits/rejected": -1.1423550844192505, "logps/chosen": -88.64768981933594, "logps/rejected": -109.5053482055664, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.5388556718826294, "rewards/margins": 20.56551742553711, "rewards/rejected": -20.026660919189453, "step": 2200 }, { "epoch": 1.0, "eval_logits/chosen": -1.2711361646652222, "eval_logits/rejected": -1.2118490934371948, "eval_logps/chosen": -86.72749328613281, "eval_logps/rejected": -104.70652770996094, "eval_loss": 0.0074717202223837376, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.352469801902771, "eval_rewards/margins": 21.034873962402344, "eval_rewards/rejected": -19.682405471801758, "eval_runtime": 64.1917, "eval_samples_per_second": 44.585, "eval_steps_per_second": 2.789, "step": 2200 }, { "epoch": 1.01, "learning_rate": 2.2128486897717668e-07, "logits/chosen": -1.2156295776367188, "logits/rejected": -1.1702911853790283, "logps/chosen": -84.73490142822266, "logps/rejected": -109.97856140136719, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8699300289154053, "rewards/margins": 22.461772918701172, "rewards/rejected": -20.591842651367188, "step": 2210 }, { "epoch": 1.01, "learning_rate": 2.207776838546069e-07, "logits/chosen": -1.2317909002304077, "logits/rejected": -1.1759793758392334, "logps/chosen": -88.38722229003906, "logps/rejected": -110.55941009521484, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.2832095623016357, "rewards/margins": 21.7716007232666, "rewards/rejected": -19.488391876220703, "step": 2220 }, { "epoch": 1.02, "learning_rate": 2.202704987320372e-07, "logits/chosen": -1.240782618522644, "logits/rejected": -1.1896260976791382, "logps/chosen": -90.6281509399414, "logps/rejected": -107.0479736328125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.5835065841674805, "rewards/margins": 21.230846405029297, "rewards/rejected": -19.6473388671875, "step": 2230 }, { "epoch": 1.02, "learning_rate": 2.1976331360946745e-07, "logits/chosen": -1.2149629592895508, "logits/rejected": -1.1640093326568604, "logps/chosen": -90.72506713867188, "logps/rejected": -106.3701171875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.3114402294158936, "rewards/margins": 21.860370635986328, "rewards/rejected": -20.54892921447754, "step": 2240 }, { "epoch": 1.03, "learning_rate": 2.1925612848689773e-07, "logits/chosen": -1.207849383354187, "logits/rejected": -1.1694178581237793, "logps/chosen": -88.57085418701172, "logps/rejected": -107.42121887207031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3758482933044434, "rewards/margins": 21.30167007446289, "rewards/rejected": -19.925823211669922, "step": 2250 }, { "epoch": 1.03, "learning_rate": 2.1874894336432796e-07, "logits/chosen": -1.221719741821289, "logits/rejected": -1.1561182737350464, "logps/chosen": -92.80986022949219, "logps/rejected": -109.31239318847656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.313194513320923, "rewards/margins": 23.222518920898438, "rewards/rejected": -20.909326553344727, "step": 2260 }, { "epoch": 1.04, "learning_rate": 2.182417582417582e-07, "logits/chosen": -1.2283899784088135, "logits/rejected": -1.1857209205627441, "logps/chosen": -86.95245361328125, "logps/rejected": -110.2240219116211, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.5342823266983032, "rewards/margins": 21.846160888671875, "rewards/rejected": -20.311880111694336, "step": 2270 }, { "epoch": 1.04, "learning_rate": 2.177345731191885e-07, "logits/chosen": -1.2207567691802979, "logits/rejected": -1.1733354330062866, "logps/chosen": -89.40998077392578, "logps/rejected": -108.59842681884766, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.084045648574829, "rewards/margins": 21.383882522583008, "rewards/rejected": -20.29983901977539, "step": 2280 }, { "epoch": 1.05, "learning_rate": 2.1722738799661877e-07, "logits/chosen": -1.2484939098358154, "logits/rejected": -1.186006784439087, "logps/chosen": -91.48284149169922, "logps/rejected": -102.3342514038086, "loss": 0.0071, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3055270910263062, "rewards/margins": 20.427013397216797, "rewards/rejected": -19.12148666381836, "step": 2290 }, { "epoch": 1.05, "learning_rate": 2.1672020287404903e-07, "logits/chosen": -1.2090219259262085, "logits/rejected": -1.1676021814346313, "logps/chosen": -89.25651550292969, "logps/rejected": -110.95772552490234, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.608730673789978, "rewards/margins": 20.88467025756836, "rewards/rejected": -20.275938034057617, "step": 2300 }, { "epoch": 1.05, "eval_logits/chosen": -1.3087431192398071, "eval_logits/rejected": -1.2488420009613037, "eval_logps/chosen": -87.20025634765625, "eval_logps/rejected": -105.1304702758789, "eval_loss": 0.007476452272385359, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.1160887479782104, "eval_rewards/margins": 21.010465621948242, "eval_rewards/rejected": -19.894376754760742, "eval_runtime": 68.119, "eval_samples_per_second": 42.015, "eval_steps_per_second": 2.628, "step": 2300 }, { "epoch": 1.05, "learning_rate": 2.1621301775147925e-07, "logits/chosen": -1.2305896282196045, "logits/rejected": -1.1756011247634888, "logps/chosen": -85.5489501953125, "logps/rejected": -109.3161849975586, "loss": 0.0017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7630363702774048, "rewards/margins": 21.09680938720703, "rewards/rejected": -20.333772659301758, "step": 2310 }, { "epoch": 1.06, "learning_rate": 2.1570583262890953e-07, "logits/chosen": -1.2177748680114746, "logits/rejected": -1.1697107553482056, "logps/chosen": -85.29845428466797, "logps/rejected": -109.9201889038086, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4945674538612366, "rewards/margins": 21.288578033447266, "rewards/rejected": -20.794010162353516, "step": 2320 }, { "epoch": 1.06, "learning_rate": 2.1519864750633982e-07, "logits/chosen": -1.2171680927276611, "logits/rejected": -1.1630264520645142, "logps/chosen": -88.26265716552734, "logps/rejected": -104.31333923339844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.4045048952102661, "rewards/margins": 20.78306770324707, "rewards/rejected": -19.378559112548828, "step": 2330 }, { "epoch": 1.07, "learning_rate": 2.1469146238377007e-07, "logits/chosen": -1.2630029916763306, "logits/rejected": -1.2106314897537231, "logps/chosen": -88.36056518554688, "logps/rejected": -110.58036041259766, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.7803552150726318, "rewards/margins": 22.81916046142578, "rewards/rejected": -21.038806915283203, "step": 2340 }, { "epoch": 1.07, "learning_rate": 2.1418427726120035e-07, "logits/chosen": -1.2328459024429321, "logits/rejected": -1.1753827333450317, "logps/chosen": -90.98271942138672, "logps/rejected": -107.30308532714844, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.918079137802124, "rewards/margins": 22.566545486450195, "rewards/rejected": -20.648466110229492, "step": 2350 }, { "epoch": 1.08, "learning_rate": 2.1367709213863058e-07, "logits/chosen": -1.221885085105896, "logits/rejected": -1.1747316122055054, "logps/chosen": -83.33541870117188, "logps/rejected": -107.0094985961914, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.8313859701156616, "rewards/margins": 22.628454208374023, "rewards/rejected": -20.797067642211914, "step": 2360 }, { "epoch": 1.08, "learning_rate": 2.1316990701606086e-07, "logits/chosen": -1.2495050430297852, "logits/rejected": -1.1890289783477783, "logps/chosen": -86.03840637207031, "logps/rejected": -112.093505859375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.5868552923202515, "rewards/margins": 22.05801010131836, "rewards/rejected": -21.471149444580078, "step": 2370 }, { "epoch": 1.09, "learning_rate": 2.126627218934911e-07, "logits/chosen": -1.2298763990402222, "logits/rejected": -1.1767555475234985, "logps/chosen": -84.1092758178711, "logps/rejected": -106.68165588378906, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.08795030415058136, "rewards/margins": 21.050338745117188, "rewards/rejected": -20.962390899658203, "step": 2380 }, { "epoch": 1.09, "learning_rate": 2.121555367709214e-07, "logits/chosen": -1.2427847385406494, "logits/rejected": -1.2060487270355225, "logps/chosen": -85.17793273925781, "logps/rejected": -109.41981506347656, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.06969626992940903, "rewards/margins": 20.437192916870117, "rewards/rejected": -20.367496490478516, "step": 2390 }, { "epoch": 1.1, "learning_rate": 2.1164835164835165e-07, "logits/chosen": -1.223024845123291, "logits/rejected": -1.1713087558746338, "logps/chosen": -85.1490707397461, "logps/rejected": -114.04695129394531, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0163342952728271, "rewards/margins": 22.59818458557129, "rewards/rejected": -21.581851959228516, "step": 2400 }, { "epoch": 1.1, "eval_logits/chosen": -1.3130491971969604, "eval_logits/rejected": -1.2457395792007446, "eval_logps/chosen": -87.72120666503906, "eval_logps/rejected": -107.27316284179688, "eval_loss": 0.007567834109067917, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.8556116223335266, "eval_rewards/margins": 21.821340560913086, "eval_rewards/rejected": -20.965728759765625, "eval_runtime": 68.7759, "eval_samples_per_second": 41.613, "eval_steps_per_second": 2.603, "step": 2400 }, { "epoch": 1.1, "learning_rate": 2.1114116652578188e-07, "logits/chosen": -1.224714756011963, "logits/rejected": -1.1649423837661743, "logps/chosen": -91.27989959716797, "logps/rejected": -109.84356689453125, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8775809407234192, "rewards/margins": 22.14706039428711, "rewards/rejected": -21.26947784423828, "step": 2410 }, { "epoch": 1.1, "learning_rate": 2.1063398140321216e-07, "logits/chosen": -1.2352392673492432, "logits/rejected": -1.1807770729064941, "logps/chosen": -86.58528137207031, "logps/rejected": -107.39393615722656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1014773845672607, "rewards/margins": 22.366958618164062, "rewards/rejected": -21.265480041503906, "step": 2420 }, { "epoch": 1.11, "learning_rate": 2.1012679628064244e-07, "logits/chosen": -1.271728515625, "logits/rejected": -1.218794584274292, "logps/chosen": -83.79855346679688, "logps/rejected": -115.31538391113281, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.1647331714630127, "rewards/margins": 23.370630264282227, "rewards/rejected": -22.205896377563477, "step": 2430 }, { "epoch": 1.11, "learning_rate": 2.096196111580727e-07, "logits/chosen": -1.2588279247283936, "logits/rejected": -1.1938583850860596, "logps/chosen": -92.08492279052734, "logps/rejected": -108.75785827636719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.8567431569099426, "rewards/margins": 22.12790298461914, "rewards/rejected": -21.271160125732422, "step": 2440 }, { "epoch": 1.12, "learning_rate": 2.0911242603550297e-07, "logits/chosen": -1.254792332649231, "logits/rejected": -1.184197187423706, "logps/chosen": -92.77003479003906, "logps/rejected": -114.69380950927734, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.006107807159424, "rewards/margins": 24.762887954711914, "rewards/rejected": -22.756778717041016, "step": 2450 }, { "epoch": 1.12, "learning_rate": 2.086052409129332e-07, "logits/chosen": -1.2276710271835327, "logits/rejected": -1.172614336013794, "logps/chosen": -92.51925659179688, "logps/rejected": -109.40245056152344, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.4506779909133911, "rewards/margins": 22.675724029541016, "rewards/rejected": -21.225048065185547, "step": 2460 }, { "epoch": 1.13, "learning_rate": 2.0809805579036348e-07, "logits/chosen": -1.2417436838150024, "logits/rejected": -1.1916046142578125, "logps/chosen": -87.77659606933594, "logps/rejected": -107.12300109863281, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3042961359024048, "rewards/margins": 21.55357551574707, "rewards/rejected": -20.249279022216797, "step": 2470 }, { "epoch": 1.13, "learning_rate": 2.0759087066779374e-07, "logits/chosen": -1.2384653091430664, "logits/rejected": -1.19193434715271, "logps/chosen": -87.645751953125, "logps/rejected": -111.66351318359375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.5307129621505737, "rewards/margins": 22.680551528930664, "rewards/rejected": -22.149839401245117, "step": 2480 }, { "epoch": 1.14, "learning_rate": 2.0708368554522402e-07, "logits/chosen": -1.239863634109497, "logits/rejected": -1.1838042736053467, "logps/chosen": -82.25237274169922, "logps/rejected": -107.54081726074219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.317577600479126, "rewards/margins": 21.645660400390625, "rewards/rejected": -20.328083038330078, "step": 2490 }, { "epoch": 1.14, "learning_rate": 2.0657650042265424e-07, "logits/chosen": -1.238755464553833, "logits/rejected": -1.1892435550689697, "logps/chosen": -91.93738555908203, "logps/rejected": -105.78620910644531, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.8200355768203735, "rewards/margins": 21.127065658569336, "rewards/rejected": -20.307029724121094, "step": 2500 }, { "epoch": 1.14, "eval_logits/chosen": -1.2944495677947998, "eval_logits/rejected": -1.2309415340423584, "eval_logps/chosen": -87.02932739257812, "eval_logps/rejected": -104.29837799072266, "eval_loss": 0.006961911916732788, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.2015457153320312, "eval_rewards/margins": 20.67987823486328, "eval_rewards/rejected": -19.478334426879883, "eval_runtime": 71.1702, "eval_samples_per_second": 40.213, "eval_steps_per_second": 2.515, "step": 2500 }, { "epoch": 1.15, "learning_rate": 2.0606931530008452e-07, "logits/chosen": -1.2269400358200073, "logits/rejected": -1.165895700454712, "logps/chosen": -89.00814056396484, "logps/rejected": -106.82181549072266, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.1590993404388428, "rewards/margins": 21.093416213989258, "rewards/rejected": -19.934314727783203, "step": 2510 }, { "epoch": 1.15, "learning_rate": 2.0556213017751478e-07, "logits/chosen": -1.244373083114624, "logits/rejected": -1.1799051761627197, "logps/chosen": -86.1784896850586, "logps/rejected": -108.1005859375, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9702383279800415, "rewards/margins": 21.26332664489746, "rewards/rejected": -20.293087005615234, "step": 2520 }, { "epoch": 1.15, "learning_rate": 2.0505494505494506e-07, "logits/chosen": -1.239007830619812, "logits/rejected": -1.188663363456726, "logps/chosen": -89.08403015136719, "logps/rejected": -109.1912612915039, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.8398208618164062, "rewards/margins": 20.724300384521484, "rewards/rejected": -19.884479522705078, "step": 2530 }, { "epoch": 1.16, "learning_rate": 2.0454775993237531e-07, "logits/chosen": -1.259961485862732, "logits/rejected": -1.1992539167404175, "logps/chosen": -88.00000762939453, "logps/rejected": -105.28775787353516, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.7344595193862915, "rewards/margins": 22.018203735351562, "rewards/rejected": -20.283742904663086, "step": 2540 }, { "epoch": 1.16, "learning_rate": 2.0404057480980554e-07, "logits/chosen": -1.258159875869751, "logits/rejected": -1.1919949054718018, "logps/chosen": -91.0265121459961, "logps/rejected": -109.3829345703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.2989047765731812, "rewards/margins": 22.686710357666016, "rewards/rejected": -21.387805938720703, "step": 2550 }, { "epoch": 1.17, "learning_rate": 2.0353338968723582e-07, "logits/chosen": -1.216018557548523, "logits/rejected": -1.1775795221328735, "logps/chosen": -84.73078918457031, "logps/rejected": -113.40096282958984, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.12488274276256561, "rewards/margins": 21.136188507080078, "rewards/rejected": -21.261070251464844, "step": 2560 }, { "epoch": 1.17, "learning_rate": 2.030262045646661e-07, "logits/chosen": -1.2377723455429077, "logits/rejected": -1.1810615062713623, "logps/chosen": -93.79662322998047, "logps/rejected": -107.40061950683594, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.11829134076833725, "rewards/margins": 20.636348724365234, "rewards/rejected": -20.754640579223633, "step": 2570 }, { "epoch": 1.18, "learning_rate": 2.0251901944209636e-07, "logits/chosen": -1.2281471490859985, "logits/rejected": -1.1688311100006104, "logps/chosen": -91.79940795898438, "logps/rejected": -107.32728576660156, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22485239803791046, "rewards/margins": 20.858522415161133, "rewards/rejected": -20.633668899536133, "step": 2580 }, { "epoch": 1.18, "learning_rate": 2.0201183431952664e-07, "logits/chosen": -1.197676658630371, "logits/rejected": -1.1506736278533936, "logps/chosen": -81.2101821899414, "logps/rejected": -109.7088623046875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.2717623710632324, "rewards/margins": 21.169933319091797, "rewards/rejected": -21.441696166992188, "step": 2590 }, { "epoch": 1.19, "learning_rate": 2.0150464919695687e-07, "logits/chosen": -1.2182395458221436, "logits/rejected": -1.177594542503357, "logps/chosen": -80.82548522949219, "logps/rejected": -109.75965881347656, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.1711565256118774, "rewards/margins": 23.289653778076172, "rewards/rejected": -22.118499755859375, "step": 2600 }, { "epoch": 1.19, "eval_logits/chosen": -1.2811179161071777, "eval_logits/rejected": -1.2195229530334473, "eval_logps/chosen": -87.84212493896484, "eval_logps/rejected": -107.58037567138672, "eval_loss": 0.007497187703847885, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.7951509952545166, "eval_rewards/margins": 21.91447639465332, "eval_rewards/rejected": -21.119325637817383, "eval_runtime": 67.5783, "eval_samples_per_second": 42.351, "eval_steps_per_second": 2.649, "step": 2600 }, { "epoch": 1.19, "learning_rate": 2.0099746407438715e-07, "logits/chosen": -1.2012989521026611, "logits/rejected": -1.173628807067871, "logps/chosen": -84.23133087158203, "logps/rejected": -108.86763000488281, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 1.148093581199646, "rewards/margins": 21.286386489868164, "rewards/rejected": -20.138290405273438, "step": 2610 }, { "epoch": 1.2, "learning_rate": 2.004902789518174e-07, "logits/chosen": -1.236003041267395, "logits/rejected": -1.1768968105316162, "logps/chosen": -91.87477111816406, "logps/rejected": -112.02117919921875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.2967796325683594, "rewards/margins": 22.90878677368164, "rewards/rejected": -21.612009048461914, "step": 2620 }, { "epoch": 1.2, "learning_rate": 1.9998309382924768e-07, "logits/chosen": -1.2255749702453613, "logits/rejected": -1.169988989830017, "logps/chosen": -89.92630004882812, "logps/rejected": -109.48509216308594, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.11678309738636017, "rewards/margins": 20.700876235961914, "rewards/rejected": -20.584091186523438, "step": 2630 }, { "epoch": 1.2, "learning_rate": 1.9947590870667794e-07, "logits/chosen": -1.2192347049713135, "logits/rejected": -1.159459114074707, "logps/chosen": -96.32530212402344, "logps/rejected": -110.26582336425781, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.2365353107452393, "rewards/margins": 22.127044677734375, "rewards/rejected": -19.8905086517334, "step": 2640 }, { "epoch": 1.21, "learning_rate": 1.989687235841082e-07, "logits/chosen": -1.2065198421478271, "logits/rejected": -1.147120714187622, "logps/chosen": -88.47337341308594, "logps/rejected": -106.5348129272461, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.9000838994979858, "rewards/margins": 20.618175506591797, "rewards/rejected": -19.718093872070312, "step": 2650 }, { "epoch": 1.21, "learning_rate": 1.9846153846153844e-07, "logits/chosen": -1.195985198020935, "logits/rejected": -1.1491913795471191, "logps/chosen": -84.20733642578125, "logps/rejected": -109.67060852050781, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.2371699810028076, "rewards/margins": 22.225109100341797, "rewards/rejected": -20.98794174194336, "step": 2660 }, { "epoch": 1.22, "learning_rate": 1.9795435333896873e-07, "logits/chosen": -1.1977856159210205, "logits/rejected": -1.1537766456604004, "logps/chosen": -91.16226196289062, "logps/rejected": -111.68495178222656, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.734151601791382, "rewards/margins": 23.330568313598633, "rewards/rejected": -20.596416473388672, "step": 2670 }, { "epoch": 1.22, "learning_rate": 1.9744716821639898e-07, "logits/chosen": -1.224413275718689, "logits/rejected": -1.1784846782684326, "logps/chosen": -87.34507751464844, "logps/rejected": -109.75943756103516, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.3192319869995117, "rewards/margins": 22.662378311157227, "rewards/rejected": -21.343143463134766, "step": 2680 }, { "epoch": 1.23, "learning_rate": 1.9693998309382926e-07, "logits/chosen": -1.208308219909668, "logits/rejected": -1.1619625091552734, "logps/chosen": -88.23912048339844, "logps/rejected": -107.49666595458984, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.7308362126350403, "rewards/margins": 20.509281158447266, "rewards/rejected": -19.778446197509766, "step": 2690 }, { "epoch": 1.23, "learning_rate": 1.964327979712595e-07, "logits/chosen": -1.2484124898910522, "logits/rejected": -1.1881217956542969, "logps/chosen": -90.5484390258789, "logps/rejected": -105.3963851928711, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.22395610809326172, "rewards/margins": 21.083341598510742, "rewards/rejected": -21.30729866027832, "step": 2700 }, { "epoch": 1.23, "eval_logits/chosen": -1.2836943864822388, "eval_logits/rejected": -1.2240569591522217, "eval_logps/chosen": -88.02273559570312, "eval_logps/rejected": -107.38671875, "eval_loss": 0.007047051563858986, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.704842746257782, "eval_rewards/margins": 21.727344512939453, "eval_rewards/rejected": -21.022504806518555, "eval_runtime": 66.849, "eval_samples_per_second": 42.813, "eval_steps_per_second": 2.678, "step": 2700 }, { "epoch": 1.24, "learning_rate": 1.9592561284868977e-07, "logits/chosen": -1.2299364805221558, "logits/rejected": -1.1793487071990967, "logps/chosen": -92.32845306396484, "logps/rejected": -109.91078186035156, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9196799993515015, "rewards/margins": 21.892927169799805, "rewards/rejected": -20.973247528076172, "step": 2710 }, { "epoch": 1.24, "learning_rate": 1.9541842772612002e-07, "logits/chosen": -1.2148702144622803, "logits/rejected": -1.1717993021011353, "logps/chosen": -88.74729919433594, "logps/rejected": -109.92634582519531, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.4501422047615051, "rewards/margins": 20.923179626464844, "rewards/rejected": -20.47303581237793, "step": 2720 }, { "epoch": 1.25, "learning_rate": 1.949112426035503e-07, "logits/chosen": -1.2163503170013428, "logits/rejected": -1.1693785190582275, "logps/chosen": -87.26588439941406, "logps/rejected": -110.53138732910156, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8547931909561157, "rewards/margins": 21.771400451660156, "rewards/rejected": -22.626192092895508, "step": 2730 }, { "epoch": 1.25, "learning_rate": 1.9440405748098056e-07, "logits/chosen": -1.2386237382888794, "logits/rejected": -1.1867671012878418, "logps/chosen": -89.03327941894531, "logps/rejected": -112.5365219116211, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.6293159127235413, "rewards/margins": 22.378183364868164, "rewards/rejected": -21.74886703491211, "step": 2740 }, { "epoch": 1.26, "learning_rate": 1.938968723584108e-07, "logits/chosen": -1.241208791732788, "logits/rejected": -1.1873838901519775, "logps/chosen": -88.9361801147461, "logps/rejected": -111.81880187988281, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.0905447006225586, "rewards/margins": 23.877498626708984, "rewards/rejected": -22.78695297241211, "step": 2750 }, { "epoch": 1.26, "learning_rate": 1.9338968723584107e-07, "logits/chosen": -1.2293826341629028, "logits/rejected": -1.1767637729644775, "logps/chosen": -84.17843627929688, "logps/rejected": -110.3796157836914, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.36438900232315063, "rewards/margins": 22.36743927001953, "rewards/rejected": -22.731830596923828, "step": 2760 }, { "epoch": 1.26, "learning_rate": 1.9288250211327135e-07, "logits/chosen": -1.2488285303115845, "logits/rejected": -1.192888617515564, "logps/chosen": -94.11280822753906, "logps/rejected": -113.39571380615234, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.8494230508804321, "rewards/margins": 21.94849967956543, "rewards/rejected": -22.797924041748047, "step": 2770 }, { "epoch": 1.27, "learning_rate": 1.923753169907016e-07, "logits/chosen": -1.2614643573760986, "logits/rejected": -1.2098888158798218, "logps/chosen": -89.56803894042969, "logps/rejected": -112.6842041015625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.0361621156334877, "rewards/margins": 23.073833465576172, "rewards/rejected": -23.10999298095703, "step": 2780 }, { "epoch": 1.27, "learning_rate": 1.9186813186813186e-07, "logits/chosen": -1.2534093856811523, "logits/rejected": -1.1944758892059326, "logps/chosen": -93.7116470336914, "logps/rejected": -106.58601379394531, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.6133219003677368, "rewards/margins": 21.842636108398438, "rewards/rejected": -21.22931671142578, "step": 2790 }, { "epoch": 1.28, "learning_rate": 1.913609467455621e-07, "logits/chosen": -1.245892882347107, "logits/rejected": -1.1841986179351807, "logps/chosen": -90.11885070800781, "logps/rejected": -108.6605224609375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8591505885124207, "rewards/margins": 23.346960067749023, "rewards/rejected": -22.487812042236328, "step": 2800 }, { "epoch": 1.28, "eval_logits/chosen": -1.3369088172912598, "eval_logits/rejected": -1.2667419910430908, "eval_logps/chosen": -88.43457794189453, "eval_logps/rejected": -109.54808807373047, "eval_loss": 0.007251236122101545, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.4989199936389923, "eval_rewards/margins": 22.602100372314453, "eval_rewards/rejected": -22.103178024291992, "eval_runtime": 65.6934, "eval_samples_per_second": 43.566, "eval_steps_per_second": 2.725, "step": 2800 }, { "epoch": 1.28, "learning_rate": 1.908537616229924e-07, "logits/chosen": -1.258395791053772, "logits/rejected": -1.2066611051559448, "logps/chosen": -88.32936096191406, "logps/rejected": -113.46788024902344, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.060993265360593796, "rewards/margins": 23.58555030822754, "rewards/rejected": -23.52455711364746, "step": 2810 }, { "epoch": 1.29, "learning_rate": 1.9034657650042265e-07, "logits/chosen": -1.2754733562469482, "logits/rejected": -1.1980760097503662, "logps/chosen": -93.66505432128906, "logps/rejected": -109.4578857421875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.4866414964199066, "rewards/margins": 22.506893157958984, "rewards/rejected": -22.020254135131836, "step": 2820 }, { "epoch": 1.29, "learning_rate": 1.8983939137785293e-07, "logits/chosen": -1.2679641246795654, "logits/rejected": -1.2068798542022705, "logps/chosen": -89.0657958984375, "logps/rejected": -111.84321594238281, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.362867146730423, "rewards/margins": 23.33441162109375, "rewards/rejected": -23.697277069091797, "step": 2830 }, { "epoch": 1.3, "learning_rate": 1.8933220625528315e-07, "logits/chosen": -1.2522757053375244, "logits/rejected": -1.179939866065979, "logps/chosen": -93.52415466308594, "logps/rejected": -112.2892074584961, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.2486017942428589, "rewards/margins": 23.161243438720703, "rewards/rejected": -21.912641525268555, "step": 2840 }, { "epoch": 1.3, "learning_rate": 1.8882502113271343e-07, "logits/chosen": -1.2578299045562744, "logits/rejected": -1.193433165550232, "logps/chosen": -89.56175994873047, "logps/rejected": -114.70777893066406, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.8356913328170776, "rewards/margins": 23.007110595703125, "rewards/rejected": -22.171415328979492, "step": 2850 }, { "epoch": 1.31, "learning_rate": 1.883178360101437e-07, "logits/chosen": -1.279767632484436, "logits/rejected": -1.2183212041854858, "logps/chosen": -89.8489990234375, "logps/rejected": -108.4361572265625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.6225892305374146, "rewards/margins": 21.247386932373047, "rewards/rejected": -20.624794006347656, "step": 2860 }, { "epoch": 1.31, "learning_rate": 1.8781065088757397e-07, "logits/chosen": -1.3064444065093994, "logits/rejected": -1.251308798789978, "logps/chosen": -88.62841033935547, "logps/rejected": -114.151123046875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.6001108288764954, "rewards/margins": 23.617412567138672, "rewards/rejected": -23.017301559448242, "step": 2870 }, { "epoch": 1.31, "learning_rate": 1.8730346576500422e-07, "logits/chosen": -1.2963197231292725, "logits/rejected": -1.2211467027664185, "logps/chosen": -89.53153991699219, "logps/rejected": -116.00457763671875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.3336631059646606, "rewards/margins": 24.389131546020508, "rewards/rejected": -23.055469512939453, "step": 2880 }, { "epoch": 1.32, "learning_rate": 1.8679628064243448e-07, "logits/chosen": -1.2721803188323975, "logits/rejected": -1.209424614906311, "logps/chosen": -90.9636459350586, "logps/rejected": -112.62530517578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.8648780584335327, "rewards/margins": 23.844661712646484, "rewards/rejected": -22.97978401184082, "step": 2890 }, { "epoch": 1.32, "learning_rate": 1.8628909551986473e-07, "logits/chosen": -1.304807186126709, "logits/rejected": -1.2427805662155151, "logps/chosen": -83.58341979980469, "logps/rejected": -114.4560546875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.09369596093893051, "rewards/margins": 23.40268325805664, "rewards/rejected": -23.496379852294922, "step": 2900 }, { "epoch": 1.32, "eval_logits/chosen": -1.3647960424423218, "eval_logits/rejected": -1.2919734716415405, "eval_logps/chosen": -88.49767303466797, "eval_logps/rejected": -111.2530517578125, "eval_loss": 0.007645368576049805, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.4673759341239929, "eval_rewards/margins": 23.423046112060547, "eval_rewards/rejected": -22.95566749572754, "eval_runtime": 68.7318, "eval_samples_per_second": 41.64, "eval_steps_per_second": 2.604, "step": 2900 }, { "epoch": 1.33, "learning_rate": 1.8578191039729501e-07, "logits/chosen": -1.2682093381881714, "logits/rejected": -1.2079681158065796, "logps/chosen": -94.27190399169922, "logps/rejected": -113.67183685302734, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7304227352142334, "rewards/margins": 24.069181442260742, "rewards/rejected": -22.338756561279297, "step": 2910 }, { "epoch": 1.33, "learning_rate": 1.8527472527472527e-07, "logits/chosen": -1.2731274366378784, "logits/rejected": -1.2099798917770386, "logps/chosen": -90.7890625, "logps/rejected": -106.4814224243164, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.650849461555481, "rewards/margins": 22.963346481323242, "rewards/rejected": -22.312496185302734, "step": 2920 }, { "epoch": 1.34, "learning_rate": 1.8476754015215555e-07, "logits/chosen": -1.292454481124878, "logits/rejected": -1.225913166999817, "logps/chosen": -85.83187103271484, "logps/rejected": -110.36527252197266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2073954343795776, "rewards/margins": 24.3538818359375, "rewards/rejected": -23.146484375, "step": 2930 }, { "epoch": 1.34, "learning_rate": 1.8426035502958578e-07, "logits/chosen": -1.2939157485961914, "logits/rejected": -1.241677165031433, "logps/chosen": -84.26634216308594, "logps/rejected": -108.85639953613281, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16938212513923645, "rewards/margins": 22.67632293701172, "rewards/rejected": -22.506940841674805, "step": 2940 }, { "epoch": 1.35, "learning_rate": 1.8375316990701606e-07, "logits/chosen": -1.2990548610687256, "logits/rejected": -1.2253262996673584, "logps/chosen": -85.23284149169922, "logps/rejected": -111.3265609741211, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.6182515025138855, "rewards/margins": 23.335525512695312, "rewards/rejected": -22.717273712158203, "step": 2950 }, { "epoch": 1.35, "learning_rate": 1.832459847844463e-07, "logits/chosen": -1.3059203624725342, "logits/rejected": -1.2436602115631104, "logps/chosen": -90.0726089477539, "logps/rejected": -115.29248046875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.11955726146698, "rewards/margins": 24.451736450195312, "rewards/rejected": -23.332178115844727, "step": 2960 }, { "epoch": 1.36, "learning_rate": 1.827387996618766e-07, "logits/chosen": -1.2917280197143555, "logits/rejected": -1.2361105680465698, "logps/chosen": -83.81163024902344, "logps/rejected": -110.36248779296875, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2116299867630005, "rewards/margins": 23.077804565429688, "rewards/rejected": -22.866174697875977, "step": 2970 }, { "epoch": 1.36, "learning_rate": 1.8223161453930685e-07, "logits/chosen": -1.2799623012542725, "logits/rejected": -1.2227824926376343, "logps/chosen": -87.18734741210938, "logps/rejected": -111.6829605102539, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.030847549438476562, "rewards/margins": 22.31467056274414, "rewards/rejected": -22.283823013305664, "step": 2980 }, { "epoch": 1.36, "learning_rate": 1.817244294167371e-07, "logits/chosen": -1.2903432846069336, "logits/rejected": -1.2311654090881348, "logps/chosen": -92.79168701171875, "logps/rejected": -119.3042984008789, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 0.6965764164924622, "rewards/margins": 24.21768569946289, "rewards/rejected": -23.52111053466797, "step": 2990 }, { "epoch": 1.37, "learning_rate": 1.8121724429416736e-07, "logits/chosen": -1.2526795864105225, "logits/rejected": -1.1851316690444946, "logps/chosen": -89.67386627197266, "logps/rejected": -108.59903717041016, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.0026484727859497, "rewards/margins": 23.317981719970703, "rewards/rejected": -22.315330505371094, "step": 3000 }, { "epoch": 1.37, "eval_logits/chosen": -1.3374384641647339, "eval_logits/rejected": -1.2689754962921143, "eval_logps/chosen": -87.6075668334961, "eval_logps/rejected": -110.42879486083984, "eval_loss": 0.007348579820245504, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9124311804771423, "eval_rewards/margins": 23.45596694946289, "eval_rewards/rejected": -22.543535232543945, "eval_runtime": 63.1636, "eval_samples_per_second": 45.311, "eval_steps_per_second": 2.834, "step": 3000 }, { "epoch": 1.37, "learning_rate": 1.8071005917159764e-07, "logits/chosen": -1.2701785564422607, "logits/rejected": -1.2133753299713135, "logps/chosen": -89.51029968261719, "logps/rejected": -107.2208023071289, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4126553535461426, "rewards/margins": 23.328351974487305, "rewards/rejected": -21.915699005126953, "step": 3010 }, { "epoch": 1.38, "learning_rate": 1.802028740490279e-07, "logits/chosen": -1.2436578273773193, "logits/rejected": -1.178425908088684, "logps/chosen": -87.91740417480469, "logps/rejected": -108.53468322753906, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.169164776802063, "rewards/margins": 23.27883529663086, "rewards/rejected": -22.109668731689453, "step": 3020 }, { "epoch": 1.38, "learning_rate": 1.7969568892645814e-07, "logits/chosen": -1.2551841735839844, "logits/rejected": -1.1954948902130127, "logps/chosen": -86.35130310058594, "logps/rejected": -113.94222259521484, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.3147376775741577, "rewards/margins": 22.975399017333984, "rewards/rejected": -22.660661697387695, "step": 3030 }, { "epoch": 1.39, "learning_rate": 1.791885038038884e-07, "logits/chosen": -1.2619261741638184, "logits/rejected": -1.1995004415512085, "logps/chosen": -89.64755249023438, "logps/rejected": -115.27685546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.632409691810608, "rewards/margins": 25.825876235961914, "rewards/rejected": -24.193464279174805, "step": 3040 }, { "epoch": 1.39, "learning_rate": 1.7868131868131868e-07, "logits/chosen": -1.2092974185943604, "logits/rejected": -1.1581265926361084, "logps/chosen": -94.53260803222656, "logps/rejected": -113.39241790771484, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.7243481278419495, "rewards/margins": 24.363618850708008, "rewards/rejected": -23.639272689819336, "step": 3050 }, { "epoch": 1.4, "learning_rate": 1.7817413355874893e-07, "logits/chosen": -1.2218453884124756, "logits/rejected": -1.1740856170654297, "logps/chosen": -87.55658721923828, "logps/rejected": -112.15279388427734, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.7927554845809937, "rewards/margins": 24.38910484313965, "rewards/rejected": -23.59635353088379, "step": 3060 }, { "epoch": 1.4, "learning_rate": 1.7766694843617921e-07, "logits/chosen": -1.272851586341858, "logits/rejected": -1.206168293952942, "logps/chosen": -83.55747985839844, "logps/rejected": -120.4216079711914, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5537545084953308, "rewards/margins": 24.476547241210938, "rewards/rejected": -23.922794342041016, "step": 3070 }, { "epoch": 1.41, "learning_rate": 1.7715976331360944e-07, "logits/chosen": -1.2406799793243408, "logits/rejected": -1.1959577798843384, "logps/chosen": -88.22968292236328, "logps/rejected": -118.29911041259766, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.6699932813644409, "rewards/margins": 24.441137313842773, "rewards/rejected": -23.77114486694336, "step": 3080 }, { "epoch": 1.41, "learning_rate": 1.7665257819103972e-07, "logits/chosen": -1.2692559957504272, "logits/rejected": -1.1990225315093994, "logps/chosen": -91.19334411621094, "logps/rejected": -115.9480209350586, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.4864407777786255, "rewards/margins": 24.107561111450195, "rewards/rejected": -22.62112045288086, "step": 3090 }, { "epoch": 1.41, "learning_rate": 1.7614539306846998e-07, "logits/chosen": -1.2366522550582886, "logits/rejected": -1.1838405132293701, "logps/chosen": -89.22175598144531, "logps/rejected": -109.91280364990234, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.0529274940490723, "rewards/margins": 23.17164421081543, "rewards/rejected": -22.118717193603516, "step": 3100 }, { "epoch": 1.41, "eval_logits/chosen": -1.3056086301803589, "eval_logits/rejected": -1.2372196912765503, "eval_logps/chosen": -87.62897491455078, "eval_logps/rejected": -110.38069915771484, "eval_loss": 0.007260460406541824, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9017306566238403, "eval_rewards/margins": 23.421215057373047, "eval_rewards/rejected": -22.519485473632812, "eval_runtime": 74.4757, "eval_samples_per_second": 38.429, "eval_steps_per_second": 2.403, "step": 3100 }, { "epoch": 1.42, "learning_rate": 1.7563820794590026e-07, "logits/chosen": -1.2446845769882202, "logits/rejected": -1.1912556886672974, "logps/chosen": -87.75724792480469, "logps/rejected": -117.93830871582031, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7617045640945435, "rewards/margins": 24.722881317138672, "rewards/rejected": -23.961177825927734, "step": 3110 }, { "epoch": 1.42, "learning_rate": 1.751310228233305e-07, "logits/chosen": -1.2533457279205322, "logits/rejected": -1.1890594959259033, "logps/chosen": -94.13309478759766, "logps/rejected": -112.0062026977539, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.5000749826431274, "rewards/margins": 22.833240509033203, "rewards/rejected": -22.333166122436523, "step": 3120 }, { "epoch": 1.43, "learning_rate": 1.7462383770076077e-07, "logits/chosen": -1.2367146015167236, "logits/rejected": -1.178846001625061, "logps/chosen": -88.32302856445312, "logps/rejected": -115.77542877197266, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.37687280774116516, "rewards/margins": 24.140419006347656, "rewards/rejected": -23.763545989990234, "step": 3130 }, { "epoch": 1.43, "learning_rate": 1.7411665257819102e-07, "logits/chosen": -1.2372697591781616, "logits/rejected": -1.1662713289260864, "logps/chosen": -96.11444091796875, "logps/rejected": -114.61024475097656, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.8727655410766602, "rewards/margins": 24.095346450805664, "rewards/rejected": -23.222579956054688, "step": 3140 }, { "epoch": 1.44, "learning_rate": 1.736094674556213e-07, "logits/chosen": -1.2260363101959229, "logits/rejected": -1.169950246810913, "logps/chosen": -87.82176208496094, "logps/rejected": -114.6717529296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.6339221000671387, "rewards/margins": 23.64284324645996, "rewards/rejected": -23.00891876220703, "step": 3150 }, { "epoch": 1.44, "learning_rate": 1.7310228233305156e-07, "logits/chosen": -1.2591478824615479, "logits/rejected": -1.2040024995803833, "logps/chosen": -89.02254486083984, "logps/rejected": -113.30888366699219, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.9198814630508423, "rewards/margins": 23.625865936279297, "rewards/rejected": -22.705984115600586, "step": 3160 }, { "epoch": 1.45, "learning_rate": 1.7259509721048184e-07, "logits/chosen": -1.2532278299331665, "logits/rejected": -1.2039217948913574, "logps/chosen": -82.79835510253906, "logps/rejected": -113.0461654663086, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6827878952026367, "rewards/margins": 23.46468734741211, "rewards/rejected": -22.781898498535156, "step": 3170 }, { "epoch": 1.45, "learning_rate": 1.7208791208791206e-07, "logits/chosen": -1.272351622581482, "logits/rejected": -1.2235658168792725, "logps/chosen": -83.52709197998047, "logps/rejected": -115.59749603271484, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059032201766968, "rewards/margins": 24.44829559326172, "rewards/rejected": -23.54239273071289, "step": 3180 }, { "epoch": 1.46, "learning_rate": 1.7158072696534235e-07, "logits/chosen": -1.2341419458389282, "logits/rejected": -1.178829550743103, "logps/chosen": -90.00852966308594, "logps/rejected": -108.0736312866211, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2999484539031982, "rewards/margins": 22.940885543823242, "rewards/rejected": -21.64093589782715, "step": 3190 }, { "epoch": 1.46, "learning_rate": 1.710735418427726e-07, "logits/chosen": -1.2229827642440796, "logits/rejected": -1.1729035377502441, "logps/chosen": -88.7994384765625, "logps/rejected": -115.1880111694336, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.18852441012859344, "rewards/margins": 22.59259033203125, "rewards/rejected": -22.781112670898438, "step": 3200 }, { "epoch": 1.46, "eval_logits/chosen": -1.3309955596923828, "eval_logits/rejected": -1.2624685764312744, "eval_logps/chosen": -87.6993408203125, "eval_logps/rejected": -111.36738586425781, "eval_loss": 0.0076772235333919525, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.866541862487793, "eval_rewards/margins": 23.879375457763672, "eval_rewards/rejected": -23.012832641601562, "eval_runtime": 71.9768, "eval_samples_per_second": 39.763, "eval_steps_per_second": 2.487, "step": 3200 }, { "epoch": 1.47, "learning_rate": 1.7056635672020288e-07, "logits/chosen": -1.2811453342437744, "logits/rejected": -1.2151391506195068, "logps/chosen": -92.49798583984375, "logps/rejected": -116.91839599609375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.1861944198608398, "rewards/margins": 25.033100128173828, "rewards/rejected": -23.846908569335938, "step": 3210 }, { "epoch": 1.47, "learning_rate": 1.7005917159763313e-07, "logits/chosen": -1.2741590738296509, "logits/rejected": -1.2121143341064453, "logps/chosen": -88.5379409790039, "logps/rejected": -116.83935546875, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4937240481376648, "rewards/margins": 23.87834930419922, "rewards/rejected": -23.384624481201172, "step": 3220 }, { "epoch": 1.47, "learning_rate": 1.695519864750634e-07, "logits/chosen": -1.2720822095870972, "logits/rejected": -1.2110103368759155, "logps/chosen": -90.10042572021484, "logps/rejected": -108.63752746582031, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.890106201171875, "rewards/margins": 22.506465911865234, "rewards/rejected": -21.61635971069336, "step": 3230 }, { "epoch": 1.48, "learning_rate": 1.6904480135249364e-07, "logits/chosen": -1.2414791584014893, "logits/rejected": -1.1807258129119873, "logps/chosen": -90.51213073730469, "logps/rejected": -115.80250549316406, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.1200735569000244, "rewards/margins": 25.453067779541016, "rewards/rejected": -24.332996368408203, "step": 3240 }, { "epoch": 1.48, "learning_rate": 1.6853761622992392e-07, "logits/chosen": -1.2474098205566406, "logits/rejected": -1.203169584274292, "logps/chosen": -85.99417114257812, "logps/rejected": -115.08979797363281, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.23325538635253906, "rewards/margins": 23.269723892211914, "rewards/rejected": -23.036466598510742, "step": 3250 }, { "epoch": 1.49, "learning_rate": 1.6803043110735418e-07, "logits/chosen": -1.2636672258377075, "logits/rejected": -1.197353482246399, "logps/chosen": -84.11290740966797, "logps/rejected": -111.87858581542969, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.2561546564102173, "rewards/margins": 24.462989807128906, "rewards/rejected": -23.206836700439453, "step": 3260 }, { "epoch": 1.49, "learning_rate": 1.6752324598478443e-07, "logits/chosen": -1.2593055963516235, "logits/rejected": -1.191775918006897, "logps/chosen": -86.7191390991211, "logps/rejected": -121.62376403808594, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4610673785209656, "rewards/margins": 25.324344635009766, "rewards/rejected": -25.785411834716797, "step": 3270 }, { "epoch": 1.5, "learning_rate": 1.6701606086221469e-07, "logits/chosen": -1.2484862804412842, "logits/rejected": -1.1960498094558716, "logps/chosen": -91.21693420410156, "logps/rejected": -115.73272705078125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.2348252534866333, "rewards/margins": 23.737483978271484, "rewards/rejected": -23.502655029296875, "step": 3280 }, { "epoch": 1.5, "learning_rate": 1.6650887573964497e-07, "logits/chosen": -1.247483253479004, "logits/rejected": -1.1900913715362549, "logps/chosen": -89.03797149658203, "logps/rejected": -114.04032135009766, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.4475841522216797, "rewards/margins": 24.663578033447266, "rewards/rejected": -23.215991973876953, "step": 3290 }, { "epoch": 1.51, "learning_rate": 1.6600169061707522e-07, "logits/chosen": -1.3016645908355713, "logits/rejected": -1.2365801334381104, "logps/chosen": -94.5399398803711, "logps/rejected": -124.09773254394531, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.6547442078590393, "rewards/margins": 25.51272201538086, "rewards/rejected": -24.857975006103516, "step": 3300 }, { "epoch": 1.51, "eval_logits/chosen": -1.3517431020736694, "eval_logits/rejected": -1.2811920642852783, "eval_logps/chosen": -87.5091781616211, "eval_logps/rejected": -111.33209991455078, "eval_loss": 0.007596523035317659, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9616219401359558, "eval_rewards/margins": 23.956809997558594, "eval_rewards/rejected": -22.995187759399414, "eval_runtime": 59.4047, "eval_samples_per_second": 48.178, "eval_steps_per_second": 3.013, "step": 3300 }, { "epoch": 1.51, "learning_rate": 1.654945054945055e-07, "logits/chosen": -1.2787243127822876, "logits/rejected": -1.2220463752746582, "logps/chosen": -85.47830963134766, "logps/rejected": -120.3558349609375, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1819784939289093, "rewards/margins": 24.017372131347656, "rewards/rejected": -23.835391998291016, "step": 3310 }, { "epoch": 1.52, "learning_rate": 1.6498732037193573e-07, "logits/chosen": -1.2893264293670654, "logits/rejected": -1.2257001399993896, "logps/chosen": -85.7628402709961, "logps/rejected": -111.8202896118164, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.0795629024505615, "rewards/margins": 24.557388305664062, "rewards/rejected": -23.477825164794922, "step": 3320 }, { "epoch": 1.52, "learning_rate": 1.64480135249366e-07, "logits/chosen": -1.2428733110427856, "logits/rejected": -1.1967283487319946, "logps/chosen": -90.3432846069336, "logps/rejected": -121.13531494140625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.1183116436004639, "rewards/margins": 25.94301414489746, "rewards/rejected": -24.824703216552734, "step": 3330 }, { "epoch": 1.52, "learning_rate": 1.6397295012679627e-07, "logits/chosen": -1.2334848642349243, "logits/rejected": -1.1695020198822021, "logps/chosen": -89.45833587646484, "logps/rejected": -115.14144134521484, "loss": 0.007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8301811218261719, "rewards/margins": 23.684751510620117, "rewards/rejected": -22.854568481445312, "step": 3340 }, { "epoch": 1.53, "learning_rate": 1.6346576500422655e-07, "logits/chosen": -1.261792778968811, "logits/rejected": -1.2009265422821045, "logps/chosen": -91.06483459472656, "logps/rejected": -116.6786880493164, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8044708371162415, "rewards/margins": 24.910701751708984, "rewards/rejected": -24.10622787475586, "step": 3350 }, { "epoch": 1.53, "learning_rate": 1.629585798816568e-07, "logits/chosen": -1.2624332904815674, "logits/rejected": -1.2024790048599243, "logps/chosen": -92.38378143310547, "logps/rejected": -116.35567474365234, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.27976271510124207, "rewards/margins": 23.846553802490234, "rewards/rejected": -24.12631607055664, "step": 3360 }, { "epoch": 1.54, "learning_rate": 1.6245139475908705e-07, "logits/chosen": -1.2883548736572266, "logits/rejected": -1.2222377061843872, "logps/chosen": -88.30838775634766, "logps/rejected": -112.32978820800781, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.10080569982528687, "rewards/margins": 22.96854019165039, "rewards/rejected": -22.867733001708984, "step": 3370 }, { "epoch": 1.54, "learning_rate": 1.619442096365173e-07, "logits/chosen": -1.3186638355255127, "logits/rejected": -1.245281457901001, "logps/chosen": -87.22611999511719, "logps/rejected": -113.5263671875, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1867640018463135, "rewards/margins": 25.683629989624023, "rewards/rejected": -24.49686622619629, "step": 3380 }, { "epoch": 1.55, "learning_rate": 1.614370245139476e-07, "logits/chosen": -1.2895255088806152, "logits/rejected": -1.229871392250061, "logps/chosen": -85.41172790527344, "logps/rejected": -117.4696273803711, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.6948531866073608, "rewards/margins": 25.749099731445312, "rewards/rejected": -25.054248809814453, "step": 3390 }, { "epoch": 1.55, "learning_rate": 1.6092983939137784e-07, "logits/chosen": -1.2895100116729736, "logits/rejected": -1.2215838432312012, "logps/chosen": -92.64399719238281, "logps/rejected": -113.7071304321289, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.008798837661743, "rewards/margins": 25.56492805480957, "rewards/rejected": -23.556129455566406, "step": 3400 }, { "epoch": 1.55, "eval_logits/chosen": -1.352734923362732, "eval_logits/rejected": -1.2827072143554688, "eval_logps/chosen": -87.6773681640625, "eval_logps/rejected": -113.02869415283203, "eval_loss": 0.007990003563463688, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.8775248527526855, "eval_rewards/margins": 24.72101593017578, "eval_rewards/rejected": -23.843490600585938, "eval_runtime": 59.8847, "eval_samples_per_second": 47.792, "eval_steps_per_second": 2.989, "step": 3400 }, { "epoch": 1.56, "learning_rate": 1.6042265426880812e-07, "logits/chosen": -1.2344064712524414, "logits/rejected": -1.1821552515029907, "logps/chosen": -87.24339294433594, "logps/rejected": -113.21418762207031, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.5112111568450928, "rewards/margins": 24.323862075805664, "rewards/rejected": -22.81264877319336, "step": 3410 }, { "epoch": 1.56, "learning_rate": 1.5991546914623835e-07, "logits/chosen": -1.2430731058120728, "logits/rejected": -1.1846189498901367, "logps/chosen": -86.92017364501953, "logps/rejected": -111.88777160644531, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.5619323253631592, "rewards/margins": 25.513439178466797, "rewards/rejected": -23.951505661010742, "step": 3420 }, { "epoch": 1.57, "learning_rate": 1.5940828402366863e-07, "logits/chosen": -1.2778552770614624, "logits/rejected": -1.2257239818572998, "logps/chosen": -89.74571228027344, "logps/rejected": -122.40673828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483524560928345, "rewards/margins": 26.498586654663086, "rewards/rejected": -25.550235748291016, "step": 3430 }, { "epoch": 1.57, "learning_rate": 1.589010989010989e-07, "logits/chosen": -1.2671973705291748, "logits/rejected": -1.1991872787475586, "logps/chosen": -95.77862548828125, "logps/rejected": -120.44307708740234, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.5015150308609009, "rewards/margins": 25.031169891357422, "rewards/rejected": -23.529653549194336, "step": 3440 }, { "epoch": 1.57, "learning_rate": 1.5839391377852917e-07, "logits/chosen": -1.283717393875122, "logits/rejected": -1.2063400745391846, "logps/chosen": -93.32177734375, "logps/rejected": -118.2795639038086, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.5961825847625732, "rewards/margins": 25.887725830078125, "rewards/rejected": -24.29154396057129, "step": 3450 }, { "epoch": 1.58, "learning_rate": 1.5788672865595942e-07, "logits/chosen": -1.2777965068817139, "logits/rejected": -1.2139501571655273, "logps/chosen": -86.36488342285156, "logps/rejected": -112.5156021118164, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.4618368148803711, "rewards/margins": 24.505504608154297, "rewards/rejected": -24.043668746948242, "step": 3460 }, { "epoch": 1.58, "learning_rate": 1.5737954353338968e-07, "logits/chosen": -1.2852230072021484, "logits/rejected": -1.2266560792922974, "logps/chosen": -85.028076171875, "logps/rejected": -111.90921783447266, "loss": 0.0054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6333560943603516, "rewards/margins": 25.04153060913086, "rewards/rejected": -23.40817642211914, "step": 3470 }, { "epoch": 1.59, "learning_rate": 1.5687235841081993e-07, "logits/chosen": -1.2799792289733887, "logits/rejected": -1.216094970703125, "logps/chosen": -88.36021423339844, "logps/rejected": -112.99951171875, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2346739768981934, "rewards/margins": 25.258432388305664, "rewards/rejected": -24.023757934570312, "step": 3480 }, { "epoch": 1.59, "learning_rate": 1.563651732882502e-07, "logits/chosen": -1.2925515174865723, "logits/rejected": -1.2144296169281006, "logps/chosen": -93.78471374511719, "logps/rejected": -115.75556945800781, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.4354177713394165, "rewards/margins": 26.061655044555664, "rewards/rejected": -24.626237869262695, "step": 3490 }, { "epoch": 1.6, "learning_rate": 1.5585798816568047e-07, "logits/chosen": -1.2529594898223877, "logits/rejected": -1.2021251916885376, "logps/chosen": -85.00303649902344, "logps/rejected": -112.36210632324219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.6849428415298462, "rewards/margins": 23.76913070678711, "rewards/rejected": -22.08418846130371, "step": 3500 }, { "epoch": 1.6, "eval_logits/chosen": -1.347659945487976, "eval_logits/rejected": -1.2767060995101929, "eval_logps/chosen": -86.58101654052734, "eval_logps/rejected": -111.87799835205078, "eval_loss": 0.007571995258331299, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.4257049560546875, "eval_rewards/margins": 24.693849563598633, "eval_rewards/rejected": -23.268142700195312, "eval_runtime": 61.6155, "eval_samples_per_second": 46.449, "eval_steps_per_second": 2.905, "step": 3500 }, { "epoch": 1.6, "learning_rate": 1.5535080304311072e-07, "logits/chosen": -1.3136241436004639, "logits/rejected": -1.2429287433624268, "logps/chosen": -89.40177917480469, "logps/rejected": -115.84840393066406, "loss": 0.0039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.674997091293335, "rewards/margins": 24.841594696044922, "rewards/rejected": -23.166595458984375, "step": 3510 }, { "epoch": 1.61, "learning_rate": 1.5484361792054097e-07, "logits/chosen": -1.2944393157958984, "logits/rejected": -1.2253139019012451, "logps/chosen": -89.92081451416016, "logps/rejected": -117.0937271118164, "loss": 0.0036, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8267130851745605, "rewards/margins": 26.268218994140625, "rewards/rejected": -25.441509246826172, "step": 3520 }, { "epoch": 1.61, "learning_rate": 1.5433643279797126e-07, "logits/chosen": -1.2919762134552002, "logits/rejected": -1.2337154150009155, "logps/chosen": -83.75240325927734, "logps/rejected": -113.67716979980469, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.2121895551681519, "rewards/margins": 25.531957626342773, "rewards/rejected": -24.319766998291016, "step": 3530 }, { "epoch": 1.62, "learning_rate": 1.538292476754015e-07, "logits/chosen": -1.2944412231445312, "logits/rejected": -1.2345895767211914, "logps/chosen": -87.15584564208984, "logps/rejected": -115.55804443359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4887280464172363, "rewards/margins": 26.8122501373291, "rewards/rejected": -24.323522567749023, "step": 3540 }, { "epoch": 1.62, "learning_rate": 1.533220625528318e-07, "logits/chosen": -1.2804213762283325, "logits/rejected": -1.2162938117980957, "logps/chosen": -89.55908203125, "logps/rejected": -120.44891357421875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.0079282522201538, "rewards/margins": 26.581329345703125, "rewards/rejected": -25.573402404785156, "step": 3550 }, { "epoch": 1.62, "learning_rate": 1.5281487743026202e-07, "logits/chosen": -1.3119621276855469, "logits/rejected": -1.227165937423706, "logps/chosen": -96.74497985839844, "logps/rejected": -121.7210464477539, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.5257515907287598, "rewards/margins": 27.811294555664062, "rewards/rejected": -26.28554344177246, "step": 3560 }, { "epoch": 1.63, "learning_rate": 1.523076923076923e-07, "logits/chosen": -1.279449224472046, "logits/rejected": -1.22541081905365, "logps/chosen": -84.21186828613281, "logps/rejected": -113.58978271484375, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9092862010002136, "rewards/margins": 25.38943099975586, "rewards/rejected": -24.480144500732422, "step": 3570 }, { "epoch": 1.63, "learning_rate": 1.5180050718512255e-07, "logits/chosen": -1.2649726867675781, "logits/rejected": -1.1972577571868896, "logps/chosen": -91.45044708251953, "logps/rejected": -119.5804443359375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.8337907791137695, "rewards/margins": 28.143224716186523, "rewards/rejected": -25.309436798095703, "step": 3580 }, { "epoch": 1.64, "learning_rate": 1.5129332206255283e-07, "logits/chosen": -1.2978521585464478, "logits/rejected": -1.252637505531311, "logps/chosen": -90.22964477539062, "logps/rejected": -125.02496337890625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.8120530843734741, "rewards/margins": 26.514413833618164, "rewards/rejected": -24.702360153198242, "step": 3590 }, { "epoch": 1.64, "learning_rate": 1.507861369399831e-07, "logits/chosen": -1.2789386510849, "logits/rejected": -1.2158098220825195, "logps/chosen": -90.51055908203125, "logps/rejected": -118.33430480957031, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.5107016563415527, "rewards/margins": 28.50537109375, "rewards/rejected": -25.994670867919922, "step": 3600 }, { "epoch": 1.64, "eval_logits/chosen": -1.3777239322662354, "eval_logits/rejected": -1.3042701482772827, "eval_logps/chosen": -87.28597259521484, "eval_logps/rejected": -114.07413482666016, "eval_loss": 0.007842887192964554, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.0732280015945435, "eval_rewards/margins": 25.439434051513672, "eval_rewards/rejected": -24.3662052154541, "eval_runtime": 63.3589, "eval_samples_per_second": 45.171, "eval_steps_per_second": 2.825, "step": 3600 }, { "epoch": 1.65, "learning_rate": 1.5027895181741334e-07, "logits/chosen": -1.370045781135559, "logits/rejected": -1.283416748046875, "logps/chosen": -90.81478118896484, "logps/rejected": -112.16239929199219, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0164520740509033, "rewards/margins": 23.93118667602539, "rewards/rejected": -22.91473388671875, "step": 3610 }, { "epoch": 1.65, "learning_rate": 1.497717666948436e-07, "logits/chosen": -1.2713721990585327, "logits/rejected": -1.216667890548706, "logps/chosen": -84.43434143066406, "logps/rejected": -113.650146484375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.3617621064186096, "rewards/margins": 24.290042877197266, "rewards/rejected": -23.92828369140625, "step": 3620 }, { "epoch": 1.66, "learning_rate": 1.4926458157227388e-07, "logits/chosen": -1.316611647605896, "logits/rejected": -1.2415025234222412, "logps/chosen": -88.00056457519531, "logps/rejected": -118.13746643066406, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7822587490081787, "rewards/margins": 27.109432220458984, "rewards/rejected": -25.327173233032227, "step": 3630 }, { "epoch": 1.66, "learning_rate": 1.4875739644970413e-07, "logits/chosen": -1.2669055461883545, "logits/rejected": -1.198326587677002, "logps/chosen": -92.14048767089844, "logps/rejected": -115.46183776855469, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.0600082874298096, "rewards/margins": 25.716211318969727, "rewards/rejected": -24.656200408935547, "step": 3640 }, { "epoch": 1.67, "learning_rate": 1.4825021132713439e-07, "logits/chosen": -1.2855212688446045, "logits/rejected": -1.2362643480300903, "logps/chosen": -85.3121337890625, "logps/rejected": -116.80509948730469, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.1400699615478516, "rewards/margins": 25.843036651611328, "rewards/rejected": -23.70296859741211, "step": 3650 }, { "epoch": 1.67, "learning_rate": 1.4774302620456467e-07, "logits/chosen": -1.2816253900527954, "logits/rejected": -1.2244489192962646, "logps/chosen": -86.08045959472656, "logps/rejected": -120.78515625, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7160287499427795, "rewards/margins": 25.913681030273438, "rewards/rejected": -25.197650909423828, "step": 3660 }, { "epoch": 1.68, "learning_rate": 1.4723584108199492e-07, "logits/chosen": -1.3148205280303955, "logits/rejected": -1.2522845268249512, "logps/chosen": -88.29036712646484, "logps/rejected": -117.13260650634766, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6190974712371826, "rewards/margins": 26.740299224853516, "rewards/rejected": -25.121198654174805, "step": 3670 }, { "epoch": 1.68, "learning_rate": 1.4672865595942518e-07, "logits/chosen": -1.3006482124328613, "logits/rejected": -1.2300279140472412, "logps/chosen": -87.94609069824219, "logps/rejected": -120.0467300415039, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 2.505680799484253, "rewards/margins": 26.633773803710938, "rewards/rejected": -24.128095626831055, "step": 3680 }, { "epoch": 1.68, "learning_rate": 1.4622147083685546e-07, "logits/chosen": -1.3202269077301025, "logits/rejected": -1.252046823501587, "logps/chosen": -95.1660385131836, "logps/rejected": -121.23722839355469, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.524958848953247, "rewards/margins": 27.411617279052734, "rewards/rejected": -25.886661529541016, "step": 3690 }, { "epoch": 1.69, "learning_rate": 1.457142857142857e-07, "logits/chosen": -1.2965933084487915, "logits/rejected": -1.2345640659332275, "logps/chosen": -83.83021545410156, "logps/rejected": -117.0789794921875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.2809300422668457, "rewards/margins": 26.77134132385254, "rewards/rejected": -24.49040985107422, "step": 3700 }, { "epoch": 1.69, "eval_logits/chosen": -1.3869707584381104, "eval_logits/rejected": -1.3094760179519653, "eval_logps/chosen": -86.7977066040039, "eval_logps/rejected": -115.47274017333984, "eval_loss": 0.008259255439043045, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.3173613548278809, "eval_rewards/margins": 26.382875442504883, "eval_rewards/rejected": -25.065513610839844, "eval_runtime": 63.3424, "eval_samples_per_second": 45.183, "eval_steps_per_second": 2.826, "step": 3700 }, { "epoch": 1.69, "learning_rate": 1.4520710059171596e-07, "logits/chosen": -1.2985725402832031, "logits/rejected": -1.2297552824020386, "logps/chosen": -90.36856079101562, "logps/rejected": -116.03338623046875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.916562557220459, "rewards/margins": 27.512588500976562, "rewards/rejected": -24.596027374267578, "step": 3710 }, { "epoch": 1.7, "learning_rate": 1.4469991546914622e-07, "logits/chosen": -1.3176157474517822, "logits/rejected": -1.247491478919983, "logps/chosen": -85.81680297851562, "logps/rejected": -112.21038818359375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.835550308227539, "rewards/margins": 26.416223526000977, "rewards/rejected": -24.580673217773438, "step": 3720 }, { "epoch": 1.7, "learning_rate": 1.441927303465765e-07, "logits/chosen": -1.3307037353515625, "logits/rejected": -1.2499626874923706, "logps/chosen": -90.99395751953125, "logps/rejected": -117.75846099853516, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.8873962163925171, "rewards/margins": 25.621952056884766, "rewards/rejected": -24.734556198120117, "step": 3730 }, { "epoch": 1.71, "learning_rate": 1.4368554522400675e-07, "logits/chosen": -1.2795710563659668, "logits/rejected": -1.2092183828353882, "logps/chosen": -90.55952453613281, "logps/rejected": -121.86543273925781, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.7004776000976562, "rewards/margins": 29.03326988220215, "rewards/rejected": -26.332794189453125, "step": 3740 }, { "epoch": 1.71, "learning_rate": 1.43178360101437e-07, "logits/chosen": -1.3224431276321411, "logits/rejected": -1.2512967586517334, "logps/chosen": -84.958984375, "logps/rejected": -113.85465240478516, "loss": 0.0016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3535866737365723, "rewards/margins": 26.663818359375, "rewards/rejected": -24.310230255126953, "step": 3750 }, { "epoch": 1.72, "learning_rate": 1.426711749788673e-07, "logits/chosen": -1.3394898176193237, "logits/rejected": -1.2797186374664307, "logps/chosen": -83.2117919921875, "logps/rejected": -118.57608795166016, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.2878071069717407, "rewards/margins": 26.771793365478516, "rewards/rejected": -25.483983993530273, "step": 3760 }, { "epoch": 1.72, "learning_rate": 1.4216398985629754e-07, "logits/chosen": -1.2791587114334106, "logits/rejected": -1.2220796346664429, "logps/chosen": -87.02328491210938, "logps/rejected": -123.32960510253906, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.185002088546753, "rewards/margins": 27.750438690185547, "rewards/rejected": -25.565439224243164, "step": 3770 }, { "epoch": 1.73, "learning_rate": 1.416568047337278e-07, "logits/chosen": -1.3284387588500977, "logits/rejected": -1.2588412761688232, "logps/chosen": -88.56816101074219, "logps/rejected": -118.0064468383789, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4277937412261963, "rewards/margins": 26.984973907470703, "rewards/rejected": -25.557178497314453, "step": 3780 }, { "epoch": 1.73, "learning_rate": 1.4114961961115805e-07, "logits/chosen": -1.3133352994918823, "logits/rejected": -1.248510718345642, "logps/chosen": -92.50769805908203, "logps/rejected": -114.99954986572266, "loss": 0.0046, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1458348035812378, "rewards/margins": 24.03805160522461, "rewards/rejected": -22.89221954345703, "step": 3790 }, { "epoch": 1.73, "learning_rate": 1.4064243448858833e-07, "logits/chosen": -1.302869200706482, "logits/rejected": -1.2390058040618896, "logps/chosen": -82.003662109375, "logps/rejected": -112.65030670166016, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 2.544621706008911, "rewards/margins": 26.4320068359375, "rewards/rejected": -23.887386322021484, "step": 3800 }, { "epoch": 1.73, "eval_logits/chosen": -1.3750476837158203, "eval_logits/rejected": -1.29920494556427, "eval_logps/chosen": -85.6637191772461, "eval_logps/rejected": -111.03407287597656, "eval_loss": 0.007216113153845072, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.884352207183838, "eval_rewards/margins": 24.73053741455078, "eval_rewards/rejected": -22.84618377685547, "eval_runtime": 62.1547, "eval_samples_per_second": 46.046, "eval_steps_per_second": 2.88, "step": 3800 }, { "epoch": 1.74, "learning_rate": 1.401352493660186e-07, "logits/chosen": -1.2911725044250488, "logits/rejected": -1.2250169515609741, "logps/chosen": -84.48226928710938, "logps/rejected": -108.87812805175781, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1833953857421875, "rewards/margins": 23.128978729248047, "rewards/rejected": -21.94558334350586, "step": 3810 }, { "epoch": 1.74, "learning_rate": 1.3962806424344884e-07, "logits/chosen": -1.320176362991333, "logits/rejected": -1.2417948246002197, "logps/chosen": -87.6998519897461, "logps/rejected": -110.061279296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.8369624614715576, "rewards/margins": 24.469478607177734, "rewards/rejected": -22.63251495361328, "step": 3820 }, { "epoch": 1.75, "learning_rate": 1.3912087912087912e-07, "logits/chosen": -1.3057355880737305, "logits/rejected": -1.23427414894104, "logps/chosen": -84.95487976074219, "logps/rejected": -113.34101867675781, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.752861976623535, "rewards/margins": 27.136266708374023, "rewards/rejected": -24.383405685424805, "step": 3830 }, { "epoch": 1.75, "learning_rate": 1.3861369399830938e-07, "logits/chosen": -1.3030786514282227, "logits/rejected": -1.236664056777954, "logps/chosen": -83.6253890991211, "logps/rejected": -113.64739990234375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.3076999187469482, "rewards/margins": 25.728729248046875, "rewards/rejected": -24.421030044555664, "step": 3840 }, { "epoch": 1.76, "learning_rate": 1.3810650887573963e-07, "logits/chosen": -1.30088210105896, "logits/rejected": -1.2418001890182495, "logps/chosen": -85.37004852294922, "logps/rejected": -113.1008071899414, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 2.105379104614258, "rewards/margins": 25.393503189086914, "rewards/rejected": -23.288122177124023, "step": 3850 }, { "epoch": 1.76, "learning_rate": 1.3759932375316989e-07, "logits/chosen": -1.2641175985336304, "logits/rejected": -1.20892333984375, "logps/chosen": -87.78987121582031, "logps/rejected": -111.98150634765625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.467369556427002, "rewards/margins": 25.168516159057617, "rewards/rejected": -22.70115089416504, "step": 3860 }, { "epoch": 1.77, "learning_rate": 1.3709213863060017e-07, "logits/chosen": -1.328049898147583, "logits/rejected": -1.2603827714920044, "logps/chosen": -81.99150085449219, "logps/rejected": -113.47479248046875, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4597467184066772, "rewards/margins": 24.159908294677734, "rewards/rejected": -22.70016098022461, "step": 3870 }, { "epoch": 1.77, "learning_rate": 1.3658495350803042e-07, "logits/chosen": -1.2881757020950317, "logits/rejected": -1.219227910041809, "logps/chosen": -89.5770034790039, "logps/rejected": -117.85916900634766, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6704366207122803, "rewards/margins": 26.334217071533203, "rewards/rejected": -24.663782119750977, "step": 3880 }, { "epoch": 1.78, "learning_rate": 1.3607776838546067e-07, "logits/chosen": -1.2890180349349976, "logits/rejected": -1.2281222343444824, "logps/chosen": -87.47191619873047, "logps/rejected": -117.46723937988281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.8343310356140137, "rewards/margins": 27.57296371459961, "rewards/rejected": -24.738632202148438, "step": 3890 }, { "epoch": 1.78, "learning_rate": 1.3557058326289096e-07, "logits/chosen": -1.2887780666351318, "logits/rejected": -1.2126020193099976, "logps/chosen": -92.53477478027344, "logps/rejected": -118.53013610839844, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5731879472732544, "rewards/margins": 27.09688949584961, "rewards/rejected": -25.52370262145996, "step": 3900 }, { "epoch": 1.78, "eval_logits/chosen": -1.3683326244354248, "eval_logits/rejected": -1.2940618991851807, "eval_logps/chosen": -86.10755920410156, "eval_logps/rejected": -113.04136657714844, "eval_loss": 0.00731794023886323, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.6624343395233154, "eval_rewards/margins": 25.512258529663086, "eval_rewards/rejected": -23.849822998046875, "eval_runtime": 60.9365, "eval_samples_per_second": 46.967, "eval_steps_per_second": 2.937, "step": 3900 }, { "epoch": 1.78, "learning_rate": 1.350633981403212e-07, "logits/chosen": -1.2927218675613403, "logits/rejected": -1.2228248119354248, "logps/chosen": -83.60839080810547, "logps/rejected": -112.75862121582031, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.901503324508667, "rewards/margins": 24.796140670776367, "rewards/rejected": -22.894638061523438, "step": 3910 }, { "epoch": 1.79, "learning_rate": 1.3455621301775146e-07, "logits/chosen": -1.2360032796859741, "logits/rejected": -1.1930327415466309, "logps/chosen": -85.62734985351562, "logps/rejected": -117.69222259521484, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.328904390335083, "rewards/margins": 26.03916358947754, "rewards/rejected": -24.710262298583984, "step": 3920 }, { "epoch": 1.79, "learning_rate": 1.3404902789518174e-07, "logits/chosen": -1.299557089805603, "logits/rejected": -1.2226934432983398, "logps/chosen": -95.00071716308594, "logps/rejected": -120.6982650756836, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.4814507961273193, "rewards/margins": 25.5311222076416, "rewards/rejected": -24.049671173095703, "step": 3930 }, { "epoch": 1.8, "learning_rate": 1.33541842772612e-07, "logits/chosen": -1.2686429023742676, "logits/rejected": -1.2006398439407349, "logps/chosen": -87.60845184326172, "logps/rejected": -113.25736999511719, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.330335021018982, "rewards/margins": 24.519290924072266, "rewards/rejected": -23.188955307006836, "step": 3940 }, { "epoch": 1.8, "learning_rate": 1.3303465765004225e-07, "logits/chosen": -1.3029606342315674, "logits/rejected": -1.2397031784057617, "logps/chosen": -86.67044067382812, "logps/rejected": -116.4280776977539, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.413769245147705, "rewards/margins": 26.642818450927734, "rewards/rejected": -24.22905158996582, "step": 3950 }, { "epoch": 1.81, "learning_rate": 1.325274725274725e-07, "logits/chosen": -1.288098931312561, "logits/rejected": -1.22249436378479, "logps/chosen": -84.44660186767578, "logps/rejected": -116.7946548461914, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.941779375076294, "rewards/margins": 26.12166404724121, "rewards/rejected": -24.17988395690918, "step": 3960 }, { "epoch": 1.81, "learning_rate": 1.320202874049028e-07, "logits/chosen": -1.2721738815307617, "logits/rejected": -1.2212003469467163, "logps/chosen": -86.49989318847656, "logps/rejected": -116.10292053222656, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.964007019996643, "rewards/margins": 25.622446060180664, "rewards/rejected": -23.658437728881836, "step": 3970 }, { "epoch": 1.82, "learning_rate": 1.3151310228233304e-07, "logits/chosen": -1.2734956741333008, "logits/rejected": -1.2074804306030273, "logps/chosen": -87.59144592285156, "logps/rejected": -116.40534973144531, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.6690824031829834, "rewards/margins": 26.692928314208984, "rewards/rejected": -25.02384376525879, "step": 3980 }, { "epoch": 1.82, "learning_rate": 1.310059171597633e-07, "logits/chosen": -1.3229997158050537, "logits/rejected": -1.2673537731170654, "logps/chosen": -88.17668151855469, "logps/rejected": -119.3404769897461, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.873644232749939, "rewards/margins": 26.401302337646484, "rewards/rejected": -24.527660369873047, "step": 3990 }, { "epoch": 1.83, "learning_rate": 1.3049873203719358e-07, "logits/chosen": -1.3080793619155884, "logits/rejected": -1.2402544021606445, "logps/chosen": -90.626953125, "logps/rejected": -115.0836410522461, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.6485347747802734, "rewards/margins": 25.51757049560547, "rewards/rejected": -23.86903953552246, "step": 4000 }, { "epoch": 1.83, "eval_logits/chosen": -1.388069987297058, "eval_logits/rejected": -1.311712622642517, "eval_logps/chosen": -86.8644790649414, "eval_logps/rejected": -114.29080963134766, "eval_loss": 0.007556203752756119, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.283972144126892, "eval_rewards/margins": 25.75852394104004, "eval_rewards/rejected": -24.474552154541016, "eval_runtime": 61.4383, "eval_samples_per_second": 46.583, "eval_steps_per_second": 2.913, "step": 4000 }, { "epoch": 1.83, "learning_rate": 1.2999154691462383e-07, "logits/chosen": -1.2937663793563843, "logits/rejected": -1.2333722114562988, "logps/chosen": -93.04609680175781, "logps/rejected": -114.9658203125, "loss": 0.0066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.066020131111145, "rewards/margins": 25.072368621826172, "rewards/rejected": -24.00634765625, "step": 4010 }, { "epoch": 1.83, "learning_rate": 1.2948436179205409e-07, "logits/chosen": -1.3368138074874878, "logits/rejected": -1.2809922695159912, "logps/chosen": -82.21131134033203, "logps/rejected": -116.16499328613281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2117726802825928, "rewards/margins": 25.279598236083984, "rewards/rejected": -24.067827224731445, "step": 4020 }, { "epoch": 1.84, "learning_rate": 1.2897717666948434e-07, "logits/chosen": -1.275615930557251, "logits/rejected": -1.2142200469970703, "logps/chosen": -93.66702270507812, "logps/rejected": -120.89031982421875, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.48200541734695435, "rewards/margins": 24.838642120361328, "rewards/rejected": -24.356637954711914, "step": 4030 }, { "epoch": 1.84, "learning_rate": 1.2846999154691462e-07, "logits/chosen": -1.3146488666534424, "logits/rejected": -1.2405660152435303, "logps/chosen": -96.49334716796875, "logps/rejected": -119.23211669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9090890884399414, "rewards/margins": 27.04913330078125, "rewards/rejected": -25.140043258666992, "step": 4040 }, { "epoch": 1.85, "learning_rate": 1.2796280642434488e-07, "logits/chosen": -1.307680606842041, "logits/rejected": -1.2320573329925537, "logps/chosen": -95.1331787109375, "logps/rejected": -114.7768325805664, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.5242226123809814, "rewards/margins": 27.13631820678711, "rewards/rejected": -24.61209487915039, "step": 4050 }, { "epoch": 1.85, "learning_rate": 1.2745562130177513e-07, "logits/chosen": -1.3010895252227783, "logits/rejected": -1.2413908243179321, "logps/chosen": -93.52859497070312, "logps/rejected": -118.62088775634766, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.5129265785217285, "rewards/margins": 25.957473754882812, "rewards/rejected": -24.444549560546875, "step": 4060 }, { "epoch": 1.86, "learning_rate": 1.269484361792054e-07, "logits/chosen": -1.2945194244384766, "logits/rejected": -1.2368277311325073, "logps/chosen": -90.33158874511719, "logps/rejected": -120.31858825683594, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.025995206087827682, "rewards/margins": 24.914783477783203, "rewards/rejected": -24.940778732299805, "step": 4070 }, { "epoch": 1.86, "learning_rate": 1.2644125105663566e-07, "logits/chosen": -1.298654317855835, "logits/rejected": -1.2463370561599731, "logps/chosen": -85.01954650878906, "logps/rejected": -115.01509094238281, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.912619411945343, "rewards/margins": 25.490772247314453, "rewards/rejected": -24.57815170288086, "step": 4080 }, { "epoch": 1.87, "learning_rate": 1.2593406593406592e-07, "logits/chosen": -1.282814383506775, "logits/rejected": -1.2446370124816895, "logps/chosen": -85.000732421875, "logps/rejected": -119.21089935302734, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.7353527545928955, "rewards/margins": 26.011398315429688, "rewards/rejected": -24.276042938232422, "step": 4090 }, { "epoch": 1.87, "learning_rate": 1.2542688081149617e-07, "logits/chosen": -1.2971045970916748, "logits/rejected": -1.2302577495574951, "logps/chosen": -90.2288589477539, "logps/rejected": -119.21665954589844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2725168466567993, "rewards/margins": 25.95538902282715, "rewards/rejected": -24.68286895751953, "step": 4100 }, { "epoch": 1.87, "eval_logits/chosen": -1.3849560022354126, "eval_logits/rejected": -1.30857515335083, "eval_logps/chosen": -86.99442291259766, "eval_logps/rejected": -114.5916748046875, "eval_loss": 0.007762798108160496, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.2190001010894775, "eval_rewards/margins": 25.843978881835938, "eval_rewards/rejected": -24.624980926513672, "eval_runtime": 60.6192, "eval_samples_per_second": 47.213, "eval_steps_per_second": 2.953, "step": 4100 }, { "epoch": 1.88, "learning_rate": 1.2491969568892645e-07, "logits/chosen": -1.2938053607940674, "logits/rejected": -1.2201104164123535, "logps/chosen": -91.01365661621094, "logps/rejected": -116.42720031738281, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.568861961364746, "rewards/margins": 26.685710906982422, "rewards/rejected": -24.116849899291992, "step": 4110 }, { "epoch": 1.88, "learning_rate": 1.244125105663567e-07, "logits/chosen": -1.3405344486236572, "logits/rejected": -1.2672364711761475, "logps/chosen": -91.08586120605469, "logps/rejected": -121.0147705078125, "loss": 0.0022, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5391464233398438, "rewards/margins": 27.237112045288086, "rewards/rejected": -25.69796371459961, "step": 4120 }, { "epoch": 1.88, "learning_rate": 1.2390532544378696e-07, "logits/chosen": -1.2898027896881104, "logits/rejected": -1.2514622211456299, "logps/chosen": -82.71153259277344, "logps/rejected": -122.4542236328125, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.03917119652032852, "rewards/margins": 26.051074981689453, "rewards/rejected": -26.01190185546875, "step": 4130 }, { "epoch": 1.89, "learning_rate": 1.2339814032121724e-07, "logits/chosen": -1.3003642559051514, "logits/rejected": -1.2193737030029297, "logps/chosen": -92.31513214111328, "logps/rejected": -114.4735336303711, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3771212100982666, "rewards/margins": 25.401681900024414, "rewards/rejected": -24.02456283569336, "step": 4140 }, { "epoch": 1.89, "learning_rate": 1.228909551986475e-07, "logits/chosen": -1.3193917274475098, "logits/rejected": -1.2444701194763184, "logps/chosen": -88.08500671386719, "logps/rejected": -111.34246826171875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7157142162322998, "rewards/margins": 25.313156127929688, "rewards/rejected": -23.59743881225586, "step": 4150 }, { "epoch": 1.9, "learning_rate": 1.2238377007607775e-07, "logits/chosen": -1.2875083684921265, "logits/rejected": -1.2169303894042969, "logps/chosen": -90.44905090332031, "logps/rejected": -113.9192886352539, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.2316787242889404, "rewards/margins": 25.797632217407227, "rewards/rejected": -24.56595230102539, "step": 4160 }, { "epoch": 1.9, "learning_rate": 1.2187658495350803e-07, "logits/chosen": -1.319364309310913, "logits/rejected": -1.2405459880828857, "logps/chosen": -94.96125793457031, "logps/rejected": -118.59950256347656, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.9828643798828125, "rewards/margins": 25.974903106689453, "rewards/rejected": -23.99203872680664, "step": 4170 }, { "epoch": 1.91, "learning_rate": 1.213693998309383e-07, "logits/chosen": -1.3195867538452148, "logits/rejected": -1.2445495128631592, "logps/chosen": -90.23202514648438, "logps/rejected": -114.99327087402344, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.1389697790145874, "rewards/margins": 25.39142608642578, "rewards/rejected": -25.252456665039062, "step": 4180 }, { "epoch": 1.91, "learning_rate": 1.2086221470836854e-07, "logits/chosen": -1.3217099905014038, "logits/rejected": -1.248329997062683, "logps/chosen": -89.2808609008789, "logps/rejected": -123.01396179199219, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.657078742980957, "rewards/margins": 27.067142486572266, "rewards/rejected": -26.410064697265625, "step": 4190 }, { "epoch": 1.92, "learning_rate": 1.203550295857988e-07, "logits/chosen": -1.324310064315796, "logits/rejected": -1.257927417755127, "logps/chosen": -86.30921173095703, "logps/rejected": -114.84613037109375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.0646955966949463, "rewards/margins": 25.24355697631836, "rewards/rejected": -24.178863525390625, "step": 4200 }, { "epoch": 1.92, "eval_logits/chosen": -1.4128456115722656, "eval_logits/rejected": -1.331875205039978, "eval_logps/chosen": -87.6678466796875, "eval_logps/rejected": -114.6224594116211, "eval_loss": 0.007396237924695015, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.8822879791259766, "eval_rewards/margins": 25.522655487060547, "eval_rewards/rejected": -24.640369415283203, "eval_runtime": 63.2696, "eval_samples_per_second": 45.235, "eval_steps_per_second": 2.829, "step": 4200 }, { "epoch": 1.92, "learning_rate": 1.1984784446322908e-07, "logits/chosen": -1.3565280437469482, "logits/rejected": -1.2859523296356201, "logps/chosen": -85.88087463378906, "logps/rejected": -116.72319030761719, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10283108055591583, "rewards/margins": 24.91754150390625, "rewards/rejected": -25.020374298095703, "step": 4210 }, { "epoch": 1.93, "learning_rate": 1.1934065934065933e-07, "logits/chosen": -1.3157243728637695, "logits/rejected": -1.253631830215454, "logps/chosen": -88.73226165771484, "logps/rejected": -118.5372543334961, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.7852961421012878, "rewards/margins": 26.198617935180664, "rewards/rejected": -25.41332244873047, "step": 4220 }, { "epoch": 1.93, "learning_rate": 1.188334742180896e-07, "logits/chosen": -1.3382046222686768, "logits/rejected": -1.2745530605316162, "logps/chosen": -87.59422302246094, "logps/rejected": -120.77787017822266, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.4094334542751312, "rewards/margins": 26.14259910583496, "rewards/rejected": -25.733165740966797, "step": 4230 }, { "epoch": 1.94, "learning_rate": 1.1832628909551987e-07, "logits/chosen": -1.3534907102584839, "logits/rejected": -1.2823253870010376, "logps/chosen": -84.35427856445312, "logps/rejected": -119.73152923583984, "loss": 0.0029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9467264413833618, "rewards/margins": 28.15091323852539, "rewards/rejected": -26.20418357849121, "step": 4240 }, { "epoch": 1.94, "learning_rate": 1.1781910397295012e-07, "logits/chosen": -1.325655221939087, "logits/rejected": -1.2503167390823364, "logps/chosen": -89.13822937011719, "logps/rejected": -122.24861145019531, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.596404790878296, "rewards/margins": 26.64560317993164, "rewards/rejected": -25.0491943359375, "step": 4250 }, { "epoch": 1.94, "learning_rate": 1.1731191885038039e-07, "logits/chosen": -1.2953070402145386, "logits/rejected": -1.2415964603424072, "logps/chosen": -89.16068267822266, "logps/rejected": -121.19921875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.6085556745529175, "rewards/margins": 25.759998321533203, "rewards/rejected": -25.15144157409668, "step": 4260 }, { "epoch": 1.95, "learning_rate": 1.1680473372781064e-07, "logits/chosen": -1.314546823501587, "logits/rejected": -1.2527072429656982, "logps/chosen": -89.053466796875, "logps/rejected": -118.50617980957031, "loss": 0.0079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5143841505050659, "rewards/margins": 25.380834579467773, "rewards/rejected": -24.866455078125, "step": 4270 }, { "epoch": 1.95, "learning_rate": 1.1629754860524091e-07, "logits/chosen": -1.29738450050354, "logits/rejected": -1.256644606590271, "logps/chosen": -91.60551452636719, "logps/rejected": -119.3724136352539, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4654719829559326, "rewards/margins": 25.70328140258789, "rewards/rejected": -24.237810134887695, "step": 4280 }, { "epoch": 1.96, "learning_rate": 1.1579036348267118e-07, "logits/chosen": -1.3026115894317627, "logits/rejected": -1.2353935241699219, "logps/chosen": -88.94530487060547, "logps/rejected": -110.451904296875, "loss": 0.0014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12701252102851868, "rewards/margins": 23.56112289428711, "rewards/rejected": -23.43410873413086, "step": 4290 }, { "epoch": 1.96, "learning_rate": 1.1528317836010143e-07, "logits/chosen": -1.311436414718628, "logits/rejected": -1.2566239833831787, "logps/chosen": -85.2987289428711, "logps/rejected": -115.095947265625, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6091933846473694, "rewards/margins": 25.31754493713379, "rewards/rejected": -24.70834732055664, "step": 4300 }, { "epoch": 1.96, "eval_logits/chosen": -1.4097875356674194, "eval_logits/rejected": -1.3324825763702393, "eval_logps/chosen": -87.3016357421875, "eval_logps/rejected": -111.95087432861328, "eval_loss": 0.006749654188752174, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.0653921365737915, "eval_rewards/margins": 24.369966506958008, "eval_rewards/rejected": -23.30457305908203, "eval_runtime": 121.9318, "eval_samples_per_second": 23.472, "eval_steps_per_second": 1.468, "step": 4300 }, { "epoch": 1.97, "learning_rate": 1.147759932375317e-07, "logits/chosen": -1.3246426582336426, "logits/rejected": -1.2590562105178833, "logps/chosen": -88.42132568359375, "logps/rejected": -115.56095886230469, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.7354822158813477, "rewards/margins": 25.661792755126953, "rewards/rejected": -23.926311492919922, "step": 4310 }, { "epoch": 1.97, "learning_rate": 1.1426880811496195e-07, "logits/chosen": -1.3025567531585693, "logits/rejected": -1.227550745010376, "logps/chosen": -93.46360778808594, "logps/rejected": -112.05072021484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7918212413787842, "rewards/margins": 25.10645866394043, "rewards/rejected": -23.314638137817383, "step": 4320 }, { "epoch": 1.98, "learning_rate": 1.1376162299239222e-07, "logits/chosen": -1.3441492319107056, "logits/rejected": -1.2833201885223389, "logps/chosen": -87.40711975097656, "logps/rejected": -111.749267578125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 1.4917819499969482, "rewards/margins": 24.15626335144043, "rewards/rejected": -22.664485931396484, "step": 4330 }, { "epoch": 1.98, "learning_rate": 1.1325443786982247e-07, "logits/chosen": -1.2675365209579468, "logits/rejected": -1.2221524715423584, "logps/chosen": -87.13706970214844, "logps/rejected": -113.51161193847656, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.4677601754665375, "rewards/margins": 23.889986038208008, "rewards/rejected": -23.42222785949707, "step": 4340 }, { "epoch": 1.99, "learning_rate": 1.1274725274725274e-07, "logits/chosen": -1.2820374965667725, "logits/rejected": -1.224862813949585, "logps/chosen": -86.58576965332031, "logps/rejected": -113.4376220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2789913415908813, "rewards/margins": 24.847978591918945, "rewards/rejected": -23.568988800048828, "step": 4350 }, { "epoch": 1.99, "learning_rate": 1.1224006762468301e-07, "logits/chosen": -1.3038794994354248, "logits/rejected": -1.2235076427459717, "logps/chosen": -91.44725036621094, "logps/rejected": -112.098876953125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.9584652185440063, "rewards/margins": 24.83078384399414, "rewards/rejected": -23.872318267822266, "step": 4360 }, { "epoch": 1.99, "learning_rate": 1.1173288250211326e-07, "logits/chosen": -1.3245998620986938, "logits/rejected": -1.2639939785003662, "logps/chosen": -88.39584350585938, "logps/rejected": -113.70811462402344, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.3476972579956055, "rewards/margins": 23.937400817871094, "rewards/rejected": -22.589706420898438, "step": 4370 }, { "epoch": 2.0, "learning_rate": 1.1122569737954353e-07, "logits/chosen": -1.3038419485092163, "logits/rejected": -1.2349718809127808, "logps/chosen": -88.70793151855469, "logps/rejected": -110.3454360961914, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.9998445510864258, "rewards/margins": 24.013072967529297, "rewards/rejected": -22.013225555419922, "step": 4380 }, { "epoch": 2.0, "learning_rate": 1.1071851225697379e-07, "logits/chosen": -1.3599039316177368, "logits/rejected": -1.282843828201294, "logps/chosen": -95.58447265625, "logps/rejected": -119.10038757324219, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.9651339650154114, "rewards/margins": 25.86013412475586, "rewards/rejected": -24.89499855041504, "step": 4390 }, { "epoch": 2.01, "learning_rate": 1.1021132713440405e-07, "logits/chosen": -1.2892515659332275, "logits/rejected": -1.2328523397445679, "logps/chosen": -88.33984375, "logps/rejected": -111.24787902832031, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.816326916217804, "rewards/margins": 23.1959171295166, "rewards/rejected": -22.379589080810547, "step": 4400 }, { "epoch": 2.01, "eval_logits/chosen": -1.4092082977294922, "eval_logits/rejected": -1.3323308229446411, "eval_logps/chosen": -86.91348266601562, "eval_logps/rejected": -111.49284362792969, "eval_loss": 0.006608007475733757, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.2594695091247559, "eval_rewards/margins": 24.33502960205078, "eval_rewards/rejected": -23.075557708740234, "eval_runtime": 62.0198, "eval_samples_per_second": 46.147, "eval_steps_per_second": 2.886, "step": 4400 }, { "epoch": 2.01, "learning_rate": 1.0970414201183432e-07, "logits/chosen": -1.2805674076080322, "logits/rejected": -1.221649408340454, "logps/chosen": -94.44866943359375, "logps/rejected": -120.46419525146484, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9686487317085266, "rewards/margins": 25.624675750732422, "rewards/rejected": -24.65602684020996, "step": 4410 }, { "epoch": 2.02, "learning_rate": 1.0919695688926457e-07, "logits/chosen": -1.3352880477905273, "logits/rejected": -1.2853882312774658, "logps/chosen": -86.63716125488281, "logps/rejected": -115.88981628417969, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.9782888293266296, "rewards/margins": 23.240489959716797, "rewards/rejected": -22.262203216552734, "step": 4420 }, { "epoch": 2.02, "learning_rate": 1.0868977176669484e-07, "logits/chosen": -1.3116331100463867, "logits/rejected": -1.2609537839889526, "logps/chosen": -85.02494812011719, "logps/rejected": -115.8293228149414, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.2943823337554932, "rewards/margins": 24.715749740600586, "rewards/rejected": -23.421369552612305, "step": 4430 }, { "epoch": 2.03, "learning_rate": 1.081825866441251e-07, "logits/chosen": -1.3299988508224487, "logits/rejected": -1.266905665397644, "logps/chosen": -90.40599060058594, "logps/rejected": -116.70439147949219, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.15012526512146, "rewards/margins": 24.405237197875977, "rewards/rejected": -23.25511360168457, "step": 4440 }, { "epoch": 2.03, "learning_rate": 1.0767540152155536e-07, "logits/chosen": -1.324263572692871, "logits/rejected": -1.2538119554519653, "logps/chosen": -83.78913116455078, "logps/rejected": -115.94393157958984, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.9307165145874023, "rewards/margins": 24.19430160522461, "rewards/rejected": -23.263586044311523, "step": 4450 }, { "epoch": 2.04, "learning_rate": 1.0716821639898562e-07, "logits/chosen": -1.340303659439087, "logits/rejected": -1.2662428617477417, "logps/chosen": -93.44474792480469, "logps/rejected": -110.15861511230469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.513580560684204, "rewards/margins": 24.58599853515625, "rewards/rejected": -23.072418212890625, "step": 4460 }, { "epoch": 2.04, "learning_rate": 1.0666103127641589e-07, "logits/chosen": -1.3335578441619873, "logits/rejected": -1.2744941711425781, "logps/chosen": -81.96378326416016, "logps/rejected": -116.51377868652344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.511701762676239, "rewards/margins": 25.025680541992188, "rewards/rejected": -24.51398277282715, "step": 4470 }, { "epoch": 2.04, "learning_rate": 1.0615384615384615e-07, "logits/chosen": -1.3683226108551025, "logits/rejected": -1.2874228954315186, "logps/chosen": -84.69481658935547, "logps/rejected": -113.8635482788086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6471071243286133, "rewards/margins": 25.367168426513672, "rewards/rejected": -23.720062255859375, "step": 4480 }, { "epoch": 2.05, "learning_rate": 1.0564666103127641e-07, "logits/chosen": -1.3335193395614624, "logits/rejected": -1.2593390941619873, "logps/chosen": -89.44517517089844, "logps/rejected": -113.9664535522461, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6522144079208374, "rewards/margins": 24.980897903442383, "rewards/rejected": -23.328683853149414, "step": 4490 }, { "epoch": 2.05, "learning_rate": 1.0513947590870668e-07, "logits/chosen": -1.3129395246505737, "logits/rejected": -1.2438514232635498, "logps/chosen": -89.5646743774414, "logps/rejected": -110.78108215332031, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.7508271932601929, "rewards/margins": 24.931434631347656, "rewards/rejected": -24.180606842041016, "step": 4500 }, { "epoch": 2.05, "eval_logits/chosen": -1.4130858182907104, "eval_logits/rejected": -1.3323482275009155, "eval_logps/chosen": -87.22586059570312, "eval_logps/rejected": -112.64375305175781, "eval_loss": 0.0067411912605166435, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.1032801866531372, "eval_rewards/margins": 24.75430679321289, "eval_rewards/rejected": -23.651025772094727, "eval_runtime": 68.0364, "eval_samples_per_second": 42.066, "eval_steps_per_second": 2.631, "step": 4500 }, { "epoch": 2.06, "learning_rate": 1.0463229078613693e-07, "logits/chosen": -1.338761568069458, "logits/rejected": -1.2733943462371826, "logps/chosen": -93.2102279663086, "logps/rejected": -115.36387634277344, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.200361728668213, "rewards/margins": 24.453805923461914, "rewards/rejected": -23.25344467163086, "step": 4510 }, { "epoch": 2.06, "learning_rate": 1.041251056635672e-07, "logits/chosen": -1.3229678869247437, "logits/rejected": -1.258108377456665, "logps/chosen": -88.09375762939453, "logps/rejected": -119.7026596069336, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.234534740447998, "rewards/margins": 26.626230239868164, "rewards/rejected": -24.39169692993164, "step": 4520 }, { "epoch": 2.07, "learning_rate": 1.0361792054099746e-07, "logits/chosen": -1.344191312789917, "logits/rejected": -1.2628560066223145, "logps/chosen": -94.99612426757812, "logps/rejected": -112.13182067871094, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6532214879989624, "rewards/margins": 25.270252227783203, "rewards/rejected": -23.61703109741211, "step": 4530 }, { "epoch": 2.07, "learning_rate": 1.0311073541842772e-07, "logits/chosen": -1.3356167078018188, "logits/rejected": -1.274715542793274, "logps/chosen": -85.35197448730469, "logps/rejected": -114.82647705078125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.3368360996246338, "rewards/margins": 25.388629913330078, "rewards/rejected": -24.051794052124023, "step": 4540 }, { "epoch": 2.08, "learning_rate": 1.0260355029585799e-07, "logits/chosen": -1.3028538227081299, "logits/rejected": -1.243327021598816, "logps/chosen": -83.73817443847656, "logps/rejected": -119.57144927978516, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.675358235836029, "rewards/margins": 25.816675186157227, "rewards/rejected": -25.141313552856445, "step": 4550 }, { "epoch": 2.08, "learning_rate": 1.0209636517328824e-07, "logits/chosen": -1.3460599184036255, "logits/rejected": -1.2746741771697998, "logps/chosen": -86.41990661621094, "logps/rejected": -114.42486572265625, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9956108331680298, "rewards/margins": 26.080272674560547, "rewards/rejected": -25.084659576416016, "step": 4560 }, { "epoch": 2.09, "learning_rate": 1.0158918005071851e-07, "logits/chosen": -1.33854079246521, "logits/rejected": -1.2720884084701538, "logps/chosen": -88.3348617553711, "logps/rejected": -110.352783203125, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0480217933654785, "rewards/margins": 24.384031295776367, "rewards/rejected": -23.336009979248047, "step": 4570 }, { "epoch": 2.09, "learning_rate": 1.0108199492814876e-07, "logits/chosen": -1.3189232349395752, "logits/rejected": -1.2537434101104736, "logps/chosen": -93.42473602294922, "logps/rejected": -117.38157653808594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.220496416091919, "rewards/margins": 25.104259490966797, "rewards/rejected": -23.88376235961914, "step": 4580 }, { "epoch": 2.09, "learning_rate": 1.0057480980557903e-07, "logits/chosen": -1.346653938293457, "logits/rejected": -1.2888588905334473, "logps/chosen": -84.58158111572266, "logps/rejected": -116.2260971069336, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9570630788803101, "rewards/margins": 24.75169563293457, "rewards/rejected": -23.794628143310547, "step": 4590 }, { "epoch": 2.1, "learning_rate": 1.000676246830093e-07, "logits/chosen": -1.3445775508880615, "logits/rejected": -1.2613394260406494, "logps/chosen": -88.18348693847656, "logps/rejected": -117.32752990722656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.2772257328033447, "rewards/margins": 27.420055389404297, "rewards/rejected": -25.1428279876709, "step": 4600 }, { "epoch": 2.1, "eval_logits/chosen": -1.4255520105361938, "eval_logits/rejected": -1.3487104177474976, "eval_logps/chosen": -87.3498306274414, "eval_logps/rejected": -113.354248046875, "eval_loss": 0.006937822792679071, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.0413002967834473, "eval_rewards/margins": 25.047561645507812, "eval_rewards/rejected": -24.006263732910156, "eval_runtime": 70.6268, "eval_samples_per_second": 40.523, "eval_steps_per_second": 2.534, "step": 4600 }, { "epoch": 2.1, "learning_rate": 9.956043956043955e-08, "logits/chosen": -1.3094035387039185, "logits/rejected": -1.2451339960098267, "logps/chosen": -89.12696838378906, "logps/rejected": -114.3353271484375, "loss": 0.0036, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.26013678312301636, "rewards/margins": 23.884912490844727, "rewards/rejected": -23.624773025512695, "step": 4610 }, { "epoch": 2.11, "learning_rate": 9.905325443786982e-08, "logits/chosen": -1.3309595584869385, "logits/rejected": -1.2776004076004028, "logps/chosen": -86.14755249023438, "logps/rejected": -116.1485366821289, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.36527928709983826, "rewards/margins": 24.457759857177734, "rewards/rejected": -24.092479705810547, "step": 4620 }, { "epoch": 2.11, "learning_rate": 9.854606931530007e-08, "logits/chosen": -1.3390415906906128, "logits/rejected": -1.2725872993469238, "logps/chosen": -91.37030029296875, "logps/rejected": -115.81642150878906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.0659124851226807, "rewards/margins": 25.772014617919922, "rewards/rejected": -24.706104278564453, "step": 4630 }, { "epoch": 2.12, "learning_rate": 9.803888419273034e-08, "logits/chosen": -1.3206666707992554, "logits/rejected": -1.2614423036575317, "logps/chosen": -83.52031707763672, "logps/rejected": -116.8504867553711, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7927557826042175, "rewards/margins": 25.795413970947266, "rewards/rejected": -25.002655029296875, "step": 4640 }, { "epoch": 2.12, "learning_rate": 9.753169907016061e-08, "logits/chosen": -1.3370873928070068, "logits/rejected": -1.2674705982208252, "logps/chosen": -87.10755157470703, "logps/rejected": -119.5748519897461, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.217125654220581, "rewards/margins": 27.1856632232666, "rewards/rejected": -24.968536376953125, "step": 4650 }, { "epoch": 2.13, "learning_rate": 9.702451394759086e-08, "logits/chosen": -1.3084181547164917, "logits/rejected": -1.2384984493255615, "logps/chosen": -87.81915283203125, "logps/rejected": -113.61048889160156, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.1231558322906494, "rewards/margins": 25.29172706604004, "rewards/rejected": -24.168569564819336, "step": 4660 }, { "epoch": 2.13, "learning_rate": 9.651732882502113e-08, "logits/chosen": -1.31204092502594, "logits/rejected": -1.2542951107025146, "logps/chosen": -92.28341674804688, "logps/rejected": -115.6882553100586, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2532768249511719, "rewards/margins": 24.862823486328125, "rewards/rejected": -23.609546661376953, "step": 4670 }, { "epoch": 2.14, "learning_rate": 9.601014370245138e-08, "logits/chosen": -1.3849248886108398, "logits/rejected": -1.3047869205474854, "logps/chosen": -88.0652084350586, "logps/rejected": -116.75108337402344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.82236385345459, "rewards/margins": 27.45639419555664, "rewards/rejected": -24.634029388427734, "step": 4680 }, { "epoch": 2.14, "learning_rate": 9.550295857988165e-08, "logits/chosen": -1.3264617919921875, "logits/rejected": -1.2588824033737183, "logps/chosen": -90.63042449951172, "logps/rejected": -117.13484954833984, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8007968664169312, "rewards/margins": 25.313175201416016, "rewards/rejected": -24.512378692626953, "step": 4690 }, { "epoch": 2.15, "learning_rate": 9.49957734573119e-08, "logits/chosen": -1.319097876548767, "logits/rejected": -1.2523882389068604, "logps/chosen": -91.26343536376953, "logps/rejected": -122.9940414428711, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.29426923394203186, "rewards/margins": 26.019664764404297, "rewards/rejected": -25.72539710998535, "step": 4700 }, { "epoch": 2.15, "eval_logits/chosen": -1.431301236152649, "eval_logits/rejected": -1.3519829511642456, "eval_logps/chosen": -87.77189636230469, "eval_logps/rejected": -114.70515441894531, "eval_loss": 0.007204956840723753, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.8302635550498962, "eval_rewards/margins": 25.51198387145996, "eval_rewards/rejected": -24.681718826293945, "eval_runtime": 71.5845, "eval_samples_per_second": 39.981, "eval_steps_per_second": 2.501, "step": 4700 }, { "epoch": 2.15, "learning_rate": 9.448858833474217e-08, "logits/chosen": -1.3294093608856201, "logits/rejected": -1.255859613418579, "logps/chosen": -87.07064819335938, "logps/rejected": -119.63526916503906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.91957026720047, "rewards/margins": 25.611663818359375, "rewards/rejected": -24.692096710205078, "step": 4710 }, { "epoch": 2.15, "learning_rate": 9.398140321217244e-08, "logits/chosen": -1.3767739534378052, "logits/rejected": -1.3070622682571411, "logps/chosen": -91.01148986816406, "logps/rejected": -119.0385971069336, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.5754287242889404, "rewards/margins": 25.93698501586914, "rewards/rejected": -24.361553192138672, "step": 4720 }, { "epoch": 2.16, "learning_rate": 9.34742180896027e-08, "logits/chosen": -1.3255687952041626, "logits/rejected": -1.2571724653244019, "logps/chosen": -92.34770202636719, "logps/rejected": -119.81795501708984, "loss": 0.0067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.29987969994544983, "rewards/margins": 24.60814094543457, "rewards/rejected": -24.90802001953125, "step": 4730 }, { "epoch": 2.16, "learning_rate": 9.296703296703296e-08, "logits/chosen": -1.364595651626587, "logits/rejected": -1.2936162948608398, "logps/chosen": -88.7773666381836, "logps/rejected": -116.1494140625, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7363441586494446, "rewards/margins": 25.116535186767578, "rewards/rejected": -25.852880477905273, "step": 4740 }, { "epoch": 2.17, "learning_rate": 9.245984784446322e-08, "logits/chosen": -1.3715788125991821, "logits/rejected": -1.3029972314834595, "logps/chosen": -93.72126770019531, "logps/rejected": -118.23783111572266, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.3293616771697998, "rewards/margins": 25.474594116210938, "rewards/rejected": -24.145231246948242, "step": 4750 }, { "epoch": 2.17, "learning_rate": 9.195266272189349e-08, "logits/chosen": -1.3523025512695312, "logits/rejected": -1.2690999507904053, "logps/chosen": -93.32905578613281, "logps/rejected": -118.7462158203125, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9021531343460083, "rewards/margins": 26.36541748046875, "rewards/rejected": -25.463268280029297, "step": 4760 }, { "epoch": 2.18, "learning_rate": 9.144547759932375e-08, "logits/chosen": -1.294425368309021, "logits/rejected": -1.2407166957855225, "logps/chosen": -90.24799346923828, "logps/rejected": -123.93524169921875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.4086612462997437, "rewards/margins": 25.56521224975586, "rewards/rejected": -24.15654945373535, "step": 4770 }, { "epoch": 2.18, "learning_rate": 9.093829247675401e-08, "logits/chosen": -1.3438308238983154, "logits/rejected": -1.275816798210144, "logps/chosen": -91.6666488647461, "logps/rejected": -119.28436279296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.383312702178955, "rewards/margins": 27.003463745117188, "rewards/rejected": -24.620147705078125, "step": 4780 }, { "epoch": 2.19, "learning_rate": 9.043110735418427e-08, "logits/chosen": -1.3451048135757446, "logits/rejected": -1.2772449254989624, "logps/chosen": -94.4778823852539, "logps/rejected": -116.2429428100586, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2315725088119507, "rewards/margins": 25.15644073486328, "rewards/rejected": -23.924867630004883, "step": 4790 }, { "epoch": 2.19, "learning_rate": 8.992392223161453e-08, "logits/chosen": -1.2999621629714966, "logits/rejected": -1.2457572221755981, "logps/chosen": -88.31067657470703, "logps/rejected": -120.50489807128906, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5283034443855286, "rewards/margins": 25.667068481445312, "rewards/rejected": -25.13875961303711, "step": 4800 }, { "epoch": 2.19, "eval_logits/chosen": -1.4411685466766357, "eval_logits/rejected": -1.3603928089141846, "eval_logps/chosen": -87.89783477783203, "eval_logps/rejected": -114.8197250366211, "eval_loss": 0.007284725550562143, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.7672966718673706, "eval_rewards/margins": 25.506298065185547, "eval_rewards/rejected": -24.739002227783203, "eval_runtime": 61.4965, "eval_samples_per_second": 46.539, "eval_steps_per_second": 2.911, "step": 4800 }, { "epoch": 2.2, "learning_rate": 8.94167371090448e-08, "logits/chosen": -1.3604772090911865, "logits/rejected": -1.291609287261963, "logps/chosen": -87.88396453857422, "logps/rejected": -116.81595611572266, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.6294358968734741, "rewards/margins": 25.354211807250977, "rewards/rejected": -24.724775314331055, "step": 4810 }, { "epoch": 2.2, "learning_rate": 8.890955198647506e-08, "logits/chosen": -1.327947735786438, "logits/rejected": -1.2544294595718384, "logps/chosen": -87.47128295898438, "logps/rejected": -121.36673736572266, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.21699205040931702, "rewards/margins": 25.810955047607422, "rewards/rejected": -25.593963623046875, "step": 4820 }, { "epoch": 2.2, "learning_rate": 8.840236686390532e-08, "logits/chosen": -1.3332288265228271, "logits/rejected": -1.2797290086746216, "logps/chosen": -84.39314270019531, "logps/rejected": -118.59745788574219, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.0452686548233032, "rewards/margins": 26.805339813232422, "rewards/rejected": -25.760074615478516, "step": 4830 }, { "epoch": 2.21, "learning_rate": 8.789518174133559e-08, "logits/chosen": -1.3392903804779053, "logits/rejected": -1.273781418800354, "logps/chosen": -87.46485137939453, "logps/rejected": -118.17325592041016, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.027283096686005592, "rewards/margins": 25.02812385559082, "rewards/rejected": -25.05540657043457, "step": 4840 }, { "epoch": 2.21, "learning_rate": 8.738799661876584e-08, "logits/chosen": -1.36063551902771, "logits/rejected": -1.275144338607788, "logps/chosen": -93.5079116821289, "logps/rejected": -117.1824951171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.3975207209587097, "rewards/margins": 25.79745101928711, "rewards/rejected": -25.399932861328125, "step": 4850 }, { "epoch": 2.22, "learning_rate": 8.688081149619611e-08, "logits/chosen": -1.3441591262817383, "logits/rejected": -1.2624187469482422, "logps/chosen": -91.46955108642578, "logps/rejected": -112.102783203125, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1940736770629883, "rewards/margins": 25.44129753112793, "rewards/rejected": -23.247224807739258, "step": 4860 }, { "epoch": 2.22, "learning_rate": 8.637362637362636e-08, "logits/chosen": -1.371538758277893, "logits/rejected": -1.3088654279708862, "logps/chosen": -86.52335357666016, "logps/rejected": -117.4716567993164, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.9726071357727051, "rewards/margins": 26.16326332092285, "rewards/rejected": -25.190654754638672, "step": 4870 }, { "epoch": 2.23, "learning_rate": 8.586644125105663e-08, "logits/chosen": -1.3846619129180908, "logits/rejected": -1.307712197303772, "logps/chosen": -87.76756286621094, "logps/rejected": -115.74507141113281, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3747789263725281, "rewards/margins": 25.810970306396484, "rewards/rejected": -25.436189651489258, "step": 4880 }, { "epoch": 2.23, "learning_rate": 8.53592561284869e-08, "logits/chosen": -1.3542847633361816, "logits/rejected": -1.288946270942688, "logps/chosen": -91.2932357788086, "logps/rejected": -116.8852767944336, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.8335541486740112, "rewards/margins": 25.681522369384766, "rewards/rejected": -24.847965240478516, "step": 4890 }, { "epoch": 2.24, "learning_rate": 8.485207100591715e-08, "logits/chosen": -1.3608229160308838, "logits/rejected": -1.2844916582107544, "logps/chosen": -88.2170181274414, "logps/rejected": -120.30814361572266, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6866910457611084, "rewards/margins": 27.874353408813477, "rewards/rejected": -26.187664031982422, "step": 4900 }, { "epoch": 2.24, "eval_logits/chosen": -1.461911678314209, "eval_logits/rejected": -1.3814847469329834, "eval_logps/chosen": -87.90396881103516, "eval_logps/rejected": -115.87995147705078, "eval_loss": 0.007340571843087673, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.7642290592193604, "eval_rewards/margins": 26.033344268798828, "eval_rewards/rejected": -25.26911735534668, "eval_runtime": 68.6122, "eval_samples_per_second": 41.713, "eval_steps_per_second": 2.609, "step": 4900 }, { "epoch": 2.24, "learning_rate": 8.434488588334742e-08, "logits/chosen": -1.4112756252288818, "logits/rejected": -1.3500181436538696, "logps/chosen": -89.21757507324219, "logps/rejected": -117.94172668457031, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.4118437170982361, "rewards/margins": 24.942344665527344, "rewards/rejected": -24.530498504638672, "step": 4910 }, { "epoch": 2.25, "learning_rate": 8.383770076077767e-08, "logits/chosen": -1.3511896133422852, "logits/rejected": -1.2767199277877808, "logps/chosen": -95.7844009399414, "logps/rejected": -117.9591293334961, "loss": 0.0023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7757484912872314, "rewards/margins": 26.916187286376953, "rewards/rejected": -25.14043617248535, "step": 4920 }, { "epoch": 2.25, "learning_rate": 8.333051563820794e-08, "logits/chosen": -1.3630616664886475, "logits/rejected": -1.2954555749893188, "logps/chosen": -91.95133972167969, "logps/rejected": -123.22148132324219, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0093523263931274, "rewards/margins": 27.7642822265625, "rewards/rejected": -26.754928588867188, "step": 4930 }, { "epoch": 2.25, "learning_rate": 8.282333051563821e-08, "logits/chosen": -1.3652262687683105, "logits/rejected": -1.3002612590789795, "logps/chosen": -89.00601959228516, "logps/rejected": -117.67951965332031, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.025102138519287, "rewards/margins": 26.249094009399414, "rewards/rejected": -25.223987579345703, "step": 4940 }, { "epoch": 2.26, "learning_rate": 8.231614539306846e-08, "logits/chosen": -1.3879879713058472, "logits/rejected": -1.3240267038345337, "logps/chosen": -87.9173812866211, "logps/rejected": -117.83023834228516, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.08251948654651642, "rewards/margins": 24.835966110229492, "rewards/rejected": -24.75344467163086, "step": 4950 }, { "epoch": 2.26, "learning_rate": 8.180896027049873e-08, "logits/chosen": -1.372811198234558, "logits/rejected": -1.2928178310394287, "logps/chosen": -90.76931762695312, "logps/rejected": -118.8313217163086, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.388689398765564, "rewards/margins": 26.92373275756836, "rewards/rejected": -25.535043716430664, "step": 4960 }, { "epoch": 2.27, "learning_rate": 8.130177514792898e-08, "logits/chosen": -1.3881031274795532, "logits/rejected": -1.314194917678833, "logps/chosen": -87.59803771972656, "logps/rejected": -118.09504699707031, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.4410366117954254, "rewards/margins": 26.20395278930664, "rewards/rejected": -25.762914657592773, "step": 4970 }, { "epoch": 2.27, "learning_rate": 8.079459002535925e-08, "logits/chosen": -1.3640211820602417, "logits/rejected": -1.2896654605865479, "logps/chosen": -87.9101333618164, "logps/rejected": -117.8978500366211, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.964639663696289, "rewards/margins": 28.1193790435791, "rewards/rejected": -26.154743194580078, "step": 4980 }, { "epoch": 2.28, "learning_rate": 8.02874049027895e-08, "logits/chosen": -1.377068281173706, "logits/rejected": -1.3123360872268677, "logps/chosen": -87.87947845458984, "logps/rejected": -125.39814758300781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9372149705886841, "rewards/margins": 27.093730926513672, "rewards/rejected": -26.156518936157227, "step": 4990 }, { "epoch": 2.28, "learning_rate": 7.978021978021977e-08, "logits/chosen": -1.3721301555633545, "logits/rejected": -1.302929162979126, "logps/chosen": -88.95343780517578, "logps/rejected": -122.36506652832031, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.1758254766464233, "rewards/margins": 27.310205459594727, "rewards/rejected": -26.134380340576172, "step": 5000 }, { "epoch": 2.28, "eval_logits/chosen": -1.4729403257369995, "eval_logits/rejected": -1.3894855976104736, "eval_logps/chosen": -88.23526763916016, "eval_logps/rejected": -117.04609680175781, "eval_loss": 0.0077310591004788876, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.5985845923423767, "eval_rewards/margins": 26.450777053833008, "eval_rewards/rejected": -25.852190017700195, "eval_runtime": 65.1848, "eval_samples_per_second": 43.906, "eval_steps_per_second": 2.746, "step": 5000 }, { "epoch": 2.29, "learning_rate": 7.927303465765004e-08, "logits/chosen": -1.4071820974349976, "logits/rejected": -1.325002908706665, "logps/chosen": -92.00203704833984, "logps/rejected": -121.43309020996094, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.40916553139686584, "rewards/margins": 25.54302978515625, "rewards/rejected": -25.133861541748047, "step": 5010 }, { "epoch": 2.29, "learning_rate": 7.87658495350803e-08, "logits/chosen": -1.382490873336792, "logits/rejected": -1.290702223777771, "logps/chosen": -91.81244659423828, "logps/rejected": -116.6902847290039, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.020965814590454, "rewards/margins": 27.167251586914062, "rewards/rejected": -26.146282196044922, "step": 5020 }, { "epoch": 2.3, "learning_rate": 7.825866441251056e-08, "logits/chosen": -1.359053373336792, "logits/rejected": -1.290919542312622, "logps/chosen": -86.95582580566406, "logps/rejected": -120.83614349365234, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.065791368484497, "rewards/margins": 29.087757110595703, "rewards/rejected": -27.021968841552734, "step": 5030 }, { "epoch": 2.3, "learning_rate": 7.775147928994082e-08, "logits/chosen": -1.369732141494751, "logits/rejected": -1.3190780878067017, "logps/chosen": -88.25080108642578, "logps/rejected": -122.08647155761719, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.4733394682407379, "rewards/margins": 26.999135971069336, "rewards/rejected": -26.52579689025879, "step": 5040 }, { "epoch": 2.3, "learning_rate": 7.724429416737108e-08, "logits/chosen": -1.3806296586990356, "logits/rejected": -1.314660668373108, "logps/chosen": -82.57357025146484, "logps/rejected": -113.80106353759766, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.6215075254440308, "rewards/margins": 25.933462142944336, "rewards/rejected": -25.311954498291016, "step": 5050 }, { "epoch": 2.31, "learning_rate": 7.673710904480135e-08, "logits/chosen": -1.3427412509918213, "logits/rejected": -1.277625560760498, "logps/chosen": -88.83329772949219, "logps/rejected": -113.25550842285156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.39279159903526306, "rewards/margins": 24.52106475830078, "rewards/rejected": -24.128273010253906, "step": 5060 }, { "epoch": 2.31, "learning_rate": 7.62299239222316e-08, "logits/chosen": -1.3516267538070679, "logits/rejected": -1.290647268295288, "logps/chosen": -93.58750915527344, "logps/rejected": -119.11576843261719, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.06238238885998726, "rewards/margins": 25.322040557861328, "rewards/rejected": -25.384424209594727, "step": 5070 }, { "epoch": 2.32, "learning_rate": 7.572273879966187e-08, "logits/chosen": -1.3757238388061523, "logits/rejected": -1.306443452835083, "logps/chosen": -93.31622314453125, "logps/rejected": -117.56782531738281, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.3956282138824463, "rewards/margins": 26.834218978881836, "rewards/rejected": -25.43859100341797, "step": 5080 }, { "epoch": 2.32, "learning_rate": 7.521555367709213e-08, "logits/chosen": -1.3766396045684814, "logits/rejected": -1.3114879131317139, "logps/chosen": -86.86039733886719, "logps/rejected": -118.87870025634766, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0653996467590332, "rewards/margins": 26.928573608398438, "rewards/rejected": -25.863174438476562, "step": 5090 }, { "epoch": 2.33, "learning_rate": 7.47083685545224e-08, "logits/chosen": -1.38792884349823, "logits/rejected": -1.31496262550354, "logps/chosen": -86.50321197509766, "logps/rejected": -119.1225357055664, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.047428846359253, "rewards/margins": 26.675806045532227, "rewards/rejected": -25.628376007080078, "step": 5100 }, { "epoch": 2.33, "eval_logits/chosen": -1.4878861904144287, "eval_logits/rejected": -1.4035993814468384, "eval_logps/chosen": -88.03448486328125, "eval_logps/rejected": -116.85780334472656, "eval_loss": 0.007282613776624203, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.6989699006080627, "eval_rewards/margins": 26.457014083862305, "eval_rewards/rejected": -25.758041381835938, "eval_runtime": 63.4536, "eval_samples_per_second": 45.104, "eval_steps_per_second": 2.821, "step": 5100 }, { "epoch": 2.33, "learning_rate": 7.420118343195266e-08, "logits/chosen": -1.3945187330245972, "logits/rejected": -1.311867594718933, "logps/chosen": -94.6859359741211, "logps/rejected": -113.41032409667969, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.1795772314071655, "rewards/margins": 26.022451400756836, "rewards/rejected": -24.842870712280273, "step": 5110 }, { "epoch": 2.34, "learning_rate": 7.369399830938292e-08, "logits/chosen": -1.3820443153381348, "logits/rejected": -1.3166067600250244, "logps/chosen": -83.39116668701172, "logps/rejected": -115.35809326171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.4604426920413971, "rewards/margins": 26.040401458740234, "rewards/rejected": -25.579957962036133, "step": 5120 }, { "epoch": 2.34, "learning_rate": 7.318681318681318e-08, "logits/chosen": -1.3669214248657227, "logits/rejected": -1.3000479936599731, "logps/chosen": -90.93084716796875, "logps/rejected": -114.6349868774414, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9245045781135559, "rewards/margins": 26.640636444091797, "rewards/rejected": -25.716135025024414, "step": 5130 }, { "epoch": 2.35, "learning_rate": 7.267962806424344e-08, "logits/chosen": -1.3965649604797363, "logits/rejected": -1.3084334135055542, "logps/chosen": -97.05128479003906, "logps/rejected": -123.03981018066406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9901596903800964, "rewards/margins": 26.782718658447266, "rewards/rejected": -25.792556762695312, "step": 5140 }, { "epoch": 2.35, "learning_rate": 7.21724429416737e-08, "logits/chosen": -1.3820029497146606, "logits/rejected": -1.2966136932373047, "logps/chosen": -90.78387451171875, "logps/rejected": -119.2174301147461, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.5053150653839111, "rewards/margins": 27.21875, "rewards/rejected": -25.713436126708984, "step": 5150 }, { "epoch": 2.36, "learning_rate": 7.166525781910397e-08, "logits/chosen": -1.3862712383270264, "logits/rejected": -1.315342903137207, "logps/chosen": -88.68326568603516, "logps/rejected": -119.2471923828125, "loss": 0.0044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5128425359725952, "rewards/margins": 24.922550201416016, "rewards/rejected": -25.435388565063477, "step": 5160 }, { "epoch": 2.36, "learning_rate": 7.115807269653423e-08, "logits/chosen": -1.358323335647583, "logits/rejected": -1.2905691862106323, "logps/chosen": -88.0818862915039, "logps/rejected": -118.68809509277344, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4642322063446045, "rewards/margins": 26.244770050048828, "rewards/rejected": -24.78053855895996, "step": 5170 }, { "epoch": 2.36, "learning_rate": 7.06508875739645e-08, "logits/chosen": -1.412334680557251, "logits/rejected": -1.33579683303833, "logps/chosen": -93.82710266113281, "logps/rejected": -117.13541412353516, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.39411622285842896, "rewards/margins": 25.49358367919922, "rewards/rejected": -25.099470138549805, "step": 5180 }, { "epoch": 2.37, "learning_rate": 7.014370245139475e-08, "logits/chosen": -1.3709720373153687, "logits/rejected": -1.2929903268814087, "logps/chosen": -91.2157974243164, "logps/rejected": -124.2120132446289, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.4855771064758301, "rewards/margins": 27.795948028564453, "rewards/rejected": -27.31036949157715, "step": 5190 }, { "epoch": 2.37, "learning_rate": 6.963651732882502e-08, "logits/chosen": -1.3689887523651123, "logits/rejected": -1.2964820861816406, "logps/chosen": -86.01536560058594, "logps/rejected": -118.3643798828125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.1772948503494263, "rewards/margins": 27.771255493164062, "rewards/rejected": -26.59395980834961, "step": 5200 }, { "epoch": 2.37, "eval_logits/chosen": -1.4872238636016846, "eval_logits/rejected": -1.399623990058899, "eval_logps/chosen": -88.03882598876953, "eval_logps/rejected": -117.7686996459961, "eval_loss": 0.007167758885771036, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.6967981457710266, "eval_rewards/margins": 26.910293579101562, "eval_rewards/rejected": -26.213491439819336, "eval_runtime": 70.0341, "eval_samples_per_second": 40.866, "eval_steps_per_second": 2.556, "step": 5200 }, { "epoch": 2.38, "learning_rate": 6.912933220625529e-08, "logits/chosen": -1.4059669971466064, "logits/rejected": -1.3302024602890015, "logps/chosen": -88.3302230834961, "logps/rejected": -118.16276550292969, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.0470592975616455, "rewards/margins": 26.200448989868164, "rewards/rejected": -25.153392791748047, "step": 5210 }, { "epoch": 2.38, "learning_rate": 6.862214708368554e-08, "logits/chosen": -1.397664189338684, "logits/rejected": -1.3320751190185547, "logps/chosen": -85.62290954589844, "logps/rejected": -119.31951904296875, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.21012182533740997, "rewards/margins": 25.69754409790039, "rewards/rejected": -25.907669067382812, "step": 5220 }, { "epoch": 2.39, "learning_rate": 6.811496196111581e-08, "logits/chosen": -1.4220311641693115, "logits/rejected": -1.3425581455230713, "logps/chosen": -88.67344665527344, "logps/rejected": -117.9163589477539, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.3023020029067993, "rewards/margins": 27.779937744140625, "rewards/rejected": -26.477636337280273, "step": 5230 }, { "epoch": 2.39, "learning_rate": 6.760777683854606e-08, "logits/chosen": -1.3980010747909546, "logits/rejected": -1.3161672353744507, "logps/chosen": -94.78350830078125, "logps/rejected": -118.20045471191406, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.4626989364624023, "rewards/margins": 27.554616928100586, "rewards/rejected": -26.091922760009766, "step": 5240 }, { "epoch": 2.4, "learning_rate": 6.710059171597633e-08, "logits/chosen": -1.3799418210983276, "logits/rejected": -1.3152214288711548, "logps/chosen": -90.38761901855469, "logps/rejected": -122.6275863647461, "loss": 0.0044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18404527008533478, "rewards/margins": 27.751272201538086, "rewards/rejected": -27.56722640991211, "step": 5250 }, { "epoch": 2.4, "learning_rate": 6.659340659340658e-08, "logits/chosen": -1.3793939352035522, "logits/rejected": -1.3092762231826782, "logps/chosen": -90.41889953613281, "logps/rejected": -118.99210357666016, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.34509414434432983, "rewards/margins": 27.324649810791016, "rewards/rejected": -26.97955322265625, "step": 5260 }, { "epoch": 2.41, "learning_rate": 6.608622147083685e-08, "logits/chosen": -1.3813196420669556, "logits/rejected": -1.3037269115447998, "logps/chosen": -93.68791198730469, "logps/rejected": -128.4936981201172, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.1084160804748535, "rewards/margins": 28.909637451171875, "rewards/rejected": -27.801223754882812, "step": 5270 }, { "epoch": 2.41, "learning_rate": 6.557903634826712e-08, "logits/chosen": -1.3911049365997314, "logits/rejected": -1.3268201351165771, "logps/chosen": -87.09625244140625, "logps/rejected": -122.59407806396484, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.46324244141578674, "rewards/margins": 27.979909896850586, "rewards/rejected": -27.51666831970215, "step": 5280 }, { "epoch": 2.41, "learning_rate": 6.507185122569737e-08, "logits/chosen": -1.3915858268737793, "logits/rejected": -1.3138229846954346, "logps/chosen": -93.52770233154297, "logps/rejected": -121.19853210449219, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6541392803192139, "rewards/margins": 29.604787826538086, "rewards/rejected": -27.95064926147461, "step": 5290 }, { "epoch": 2.42, "learning_rate": 6.456466610312764e-08, "logits/chosen": -1.4070789813995361, "logits/rejected": -1.3349624872207642, "logps/chosen": -91.31390380859375, "logps/rejected": -119.59687805175781, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.320601224899292, "rewards/margins": 28.13943862915039, "rewards/rejected": -26.818838119506836, "step": 5300 }, { "epoch": 2.42, "eval_logits/chosen": -1.501613736152649, "eval_logits/rejected": -1.4148671627044678, "eval_logps/chosen": -88.82596588134766, "eval_logps/rejected": -119.5730209350586, "eval_loss": 0.007715919055044651, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.30323532223701477, "eval_rewards/margins": 27.418886184692383, "eval_rewards/rejected": -27.115652084350586, "eval_runtime": 60.0405, "eval_samples_per_second": 47.668, "eval_steps_per_second": 2.981, "step": 5300 }, { "epoch": 2.42, "learning_rate": 6.40574809805579e-08, "logits/chosen": -1.388594150543213, "logits/rejected": -1.314540147781372, "logps/chosen": -90.13387298583984, "logps/rejected": -122.98262023925781, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.6017175316810608, "rewards/margins": 28.00093650817871, "rewards/rejected": -27.399221420288086, "step": 5310 }, { "epoch": 2.43, "learning_rate": 6.355029585798816e-08, "logits/chosen": -1.4170560836791992, "logits/rejected": -1.3456897735595703, "logps/chosen": -88.29602813720703, "logps/rejected": -122.81431579589844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.3829694986343384, "rewards/margins": 29.370956420898438, "rewards/rejected": -27.987987518310547, "step": 5320 }, { "epoch": 2.43, "learning_rate": 6.304311073541843e-08, "logits/chosen": -1.3887192010879517, "logits/rejected": -1.3243298530578613, "logps/chosen": -88.75332641601562, "logps/rejected": -120.09466552734375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.2215611934661865, "rewards/margins": 26.91391944885254, "rewards/rejected": -25.692358016967773, "step": 5330 }, { "epoch": 2.44, "learning_rate": 6.253592561284868e-08, "logits/chosen": -1.4018067121505737, "logits/rejected": -1.3270535469055176, "logps/chosen": -93.98942565917969, "logps/rejected": -125.354248046875, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2943921089172363, "rewards/margins": 28.14287757873535, "rewards/rejected": -26.84848403930664, "step": 5340 }, { "epoch": 2.44, "learning_rate": 6.202874049027895e-08, "logits/chosen": -1.3864609003067017, "logits/rejected": -1.3177391290664673, "logps/chosen": -91.09075927734375, "logps/rejected": -124.11067962646484, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.8371120691299438, "rewards/margins": 28.26397705078125, "rewards/rejected": -27.426868438720703, "step": 5350 }, { "epoch": 2.45, "learning_rate": 6.15215553677092e-08, "logits/chosen": -1.4059492349624634, "logits/rejected": -1.3294079303741455, "logps/chosen": -93.74061584472656, "logps/rejected": -119.74947357177734, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.4449167251586914, "rewards/margins": 27.587207794189453, "rewards/rejected": -26.142292022705078, "step": 5360 }, { "epoch": 2.45, "learning_rate": 6.101437024513947e-08, "logits/chosen": -1.3770349025726318, "logits/rejected": -1.3040486574172974, "logps/chosen": -92.32209014892578, "logps/rejected": -123.76420593261719, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.0093997716903687, "rewards/margins": 27.536449432373047, "rewards/rejected": -26.527050018310547, "step": 5370 }, { "epoch": 2.46, "learning_rate": 6.050718512256973e-08, "logits/chosen": -1.3874174356460571, "logits/rejected": -1.3079521656036377, "logps/chosen": -89.07852172851562, "logps/rejected": -120.56114196777344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3143458366394043, "rewards/margins": 27.301443099975586, "rewards/rejected": -25.987096786499023, "step": 5380 }, { "epoch": 2.46, "learning_rate": 6e-08, "logits/chosen": -1.4424034357070923, "logits/rejected": -1.3676373958587646, "logps/chosen": -93.13945770263672, "logps/rejected": -124.12135314941406, "loss": 0.006, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4384705424308777, "rewards/margins": 27.193965911865234, "rewards/rejected": -26.755496978759766, "step": 5390 }, { "epoch": 2.46, "learning_rate": 5.949281487743026e-08, "logits/chosen": -1.4014787673950195, "logits/rejected": -1.3362300395965576, "logps/chosen": -88.39823150634766, "logps/rejected": -119.18223571777344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5621711611747742, "rewards/margins": 27.4252872467041, "rewards/rejected": -26.863113403320312, "step": 5400 }, { "epoch": 2.46, "eval_logits/chosen": -1.5089606046676636, "eval_logits/rejected": -1.4168381690979004, "eval_logps/chosen": -88.5813217163086, "eval_logps/rejected": -119.76658630371094, "eval_loss": 0.00763333635404706, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.42555317282676697, "eval_rewards/margins": 27.637985229492188, "eval_rewards/rejected": -27.212432861328125, "eval_runtime": 62.6634, "eval_samples_per_second": 45.673, "eval_steps_per_second": 2.857, "step": 5400 }, { "epoch": 2.47, "learning_rate": 5.898562975486052e-08, "logits/chosen": -1.4362767934799194, "logits/rejected": -1.3550790548324585, "logps/chosen": -89.90331268310547, "logps/rejected": -124.781982421875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.522152304649353, "rewards/margins": 29.26023292541504, "rewards/rejected": -27.738079071044922, "step": 5410 }, { "epoch": 2.47, "learning_rate": 5.8478444632290784e-08, "logits/chosen": -1.4165668487548828, "logits/rejected": -1.343462586402893, "logps/chosen": -90.41630554199219, "logps/rejected": -127.66859436035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.15914836525917053, "rewards/margins": 29.304555892944336, "rewards/rejected": -29.145410537719727, "step": 5420 }, { "epoch": 2.48, "learning_rate": 5.7971259509721045e-08, "logits/chosen": -1.3894436359405518, "logits/rejected": -1.317000150680542, "logps/chosen": -88.46882629394531, "logps/rejected": -120.24174499511719, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.4189438819885254, "rewards/margins": 27.224233627319336, "rewards/rejected": -26.805288314819336, "step": 5430 }, { "epoch": 2.48, "learning_rate": 5.7464074387151306e-08, "logits/chosen": -1.4001535177230835, "logits/rejected": -1.3215277194976807, "logps/chosen": -87.59601593017578, "logps/rejected": -119.82057189941406, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6788640022277832, "rewards/margins": 27.242944717407227, "rewards/rejected": -26.5640811920166, "step": 5440 }, { "epoch": 2.49, "learning_rate": 5.695688926458157e-08, "logits/chosen": -1.4465348720550537, "logits/rejected": -1.3477718830108643, "logps/chosen": -89.69527435302734, "logps/rejected": -118.54779052734375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.376774549484253, "rewards/margins": 28.213756561279297, "rewards/rejected": -25.836984634399414, "step": 5450 }, { "epoch": 2.49, "learning_rate": 5.6449704142011834e-08, "logits/chosen": -1.440271019935608, "logits/rejected": -1.3544074296951294, "logps/chosen": -85.5431900024414, "logps/rejected": -122.612060546875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.067859172821045, "rewards/margins": 29.92409896850586, "rewards/rejected": -28.85623550415039, "step": 5460 }, { "epoch": 2.5, "learning_rate": 5.5942519019442095e-08, "logits/chosen": -1.39127779006958, "logits/rejected": -1.325758695602417, "logps/chosen": -83.56703186035156, "logps/rejected": -124.7584457397461, "loss": 0.0022, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6307471990585327, "rewards/margins": 28.204599380493164, "rewards/rejected": -27.5738468170166, "step": 5470 }, { "epoch": 2.5, "learning_rate": 5.5435333896872356e-08, "logits/chosen": -1.4168808460235596, "logits/rejected": -1.3478556871414185, "logps/chosen": -89.90568542480469, "logps/rejected": -121.96623229980469, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229051113128662, "rewards/margins": 28.127361297607422, "rewards/rejected": -27.10445785522461, "step": 5480 }, { "epoch": 2.51, "learning_rate": 5.492814877430262e-08, "logits/chosen": -1.432664394378662, "logits/rejected": -1.359386920928955, "logps/chosen": -86.13729858398438, "logps/rejected": -120.99588775634766, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.0249274969100952, "rewards/margins": 28.6977481842041, "rewards/rejected": -27.67281723022461, "step": 5490 }, { "epoch": 2.51, "learning_rate": 5.442096365173288e-08, "logits/chosen": -1.4274625778198242, "logits/rejected": -1.3294579982757568, "logps/chosen": -94.29920959472656, "logps/rejected": -124.21602630615234, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.37506765127182007, "rewards/margins": 28.83444595336914, "rewards/rejected": -28.45937728881836, "step": 5500 }, { "epoch": 2.51, "eval_logits/chosen": -1.530104637145996, "eval_logits/rejected": -1.4378955364227295, "eval_logps/chosen": -88.8484115600586, "eval_logps/rejected": -120.21839141845703, "eval_loss": 0.007588541135191917, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.2920094132423401, "eval_rewards/margins": 27.730342864990234, "eval_rewards/rejected": -27.438335418701172, "eval_runtime": 63.525, "eval_samples_per_second": 45.053, "eval_steps_per_second": 2.818, "step": 5500 }, { "epoch": 2.51, "learning_rate": 5.3913778529163145e-08, "logits/chosen": -1.4144002199172974, "logits/rejected": -1.337476372718811, "logps/chosen": -86.93864440917969, "logps/rejected": -125.42852783203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5241835117340088, "rewards/margins": 29.53707504272461, "rewards/rejected": -29.012889862060547, "step": 5510 }, { "epoch": 2.52, "learning_rate": 5.3406593406593406e-08, "logits/chosen": -1.3884155750274658, "logits/rejected": -1.324973464012146, "logps/chosen": -83.95980834960938, "logps/rejected": -120.20356750488281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.15396860241889954, "rewards/margins": 26.457748413085938, "rewards/rejected": -26.61171531677246, "step": 5520 }, { "epoch": 2.52, "learning_rate": 5.289940828402367e-08, "logits/chosen": -1.4237573146820068, "logits/rejected": -1.3433820009231567, "logps/chosen": -94.42403411865234, "logps/rejected": -125.9416732788086, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3251895606517792, "rewards/margins": 28.843847274780273, "rewards/rejected": -28.51865577697754, "step": 5530 }, { "epoch": 2.53, "learning_rate": 5.239222316145393e-08, "logits/chosen": -1.4355299472808838, "logits/rejected": -1.363010048866272, "logps/chosen": -88.18751525878906, "logps/rejected": -125.2325668334961, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.37160930037498474, "rewards/margins": 28.638696670532227, "rewards/rejected": -28.26708984375, "step": 5540 }, { "epoch": 2.53, "learning_rate": 5.188503803888419e-08, "logits/chosen": -1.4135535955429077, "logits/rejected": -1.331364631652832, "logps/chosen": -94.18409729003906, "logps/rejected": -123.58796691894531, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10003487765789032, "rewards/margins": 28.183786392211914, "rewards/rejected": -28.083749771118164, "step": 5550 }, { "epoch": 2.54, "learning_rate": 5.137785291631445e-08, "logits/chosen": -1.4515702724456787, "logits/rejected": -1.3703655004501343, "logps/chosen": -86.55001068115234, "logps/rejected": -129.2012481689453, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.22907297313213348, "rewards/margins": 29.383153915405273, "rewards/rejected": -29.15407943725586, "step": 5560 }, { "epoch": 2.54, "learning_rate": 5.087066779374472e-08, "logits/chosen": -1.3880281448364258, "logits/rejected": -1.3039556741714478, "logps/chosen": -93.82992553710938, "logps/rejected": -118.81803131103516, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.1777141094207764, "rewards/margins": 27.936527252197266, "rewards/rejected": -26.758813858032227, "step": 5570 }, { "epoch": 2.55, "learning_rate": 5.036348267117498e-08, "logits/chosen": -1.4143790006637573, "logits/rejected": -1.3327438831329346, "logps/chosen": -90.89906311035156, "logps/rejected": -125.50030517578125, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.46656179428100586, "rewards/margins": 29.756689071655273, "rewards/rejected": -29.290124893188477, "step": 5580 }, { "epoch": 2.55, "learning_rate": 4.985629754860524e-08, "logits/chosen": -1.3946322202682495, "logits/rejected": -1.3158382177352905, "logps/chosen": -93.13166809082031, "logps/rejected": -120.45831298828125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.6907525062561035, "rewards/margins": 27.038888931274414, "rewards/rejected": -27.72964096069336, "step": 5590 }, { "epoch": 2.56, "learning_rate": 4.93491124260355e-08, "logits/chosen": -1.444240927696228, "logits/rejected": -1.3477418422698975, "logps/chosen": -92.03813171386719, "logps/rejected": -123.6197280883789, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.03295283392071724, "rewards/margins": 28.512805938720703, "rewards/rejected": -28.479854583740234, "step": 5600 }, { "epoch": 2.56, "eval_logits/chosen": -1.5288245677947998, "eval_logits/rejected": -1.4369823932647705, "eval_logps/chosen": -89.01703643798828, "eval_logps/rejected": -120.62100982666016, "eval_loss": 0.007766298484057188, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.2076922357082367, "eval_rewards/margins": 27.84734535217285, "eval_rewards/rejected": -27.639650344848633, "eval_runtime": 65.4543, "eval_samples_per_second": 43.725, "eval_steps_per_second": 2.735, "step": 5600 }, { "epoch": 2.56, "learning_rate": 4.884192730346576e-08, "logits/chosen": -1.4230222702026367, "logits/rejected": -1.3513391017913818, "logps/chosen": -90.27880096435547, "logps/rejected": -119.94172668457031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.44236135482788086, "rewards/margins": 28.575183868408203, "rewards/rejected": -28.132823944091797, "step": 5610 }, { "epoch": 2.57, "learning_rate": 4.833474218089602e-08, "logits/chosen": -1.4552013874053955, "logits/rejected": -1.3836065530776978, "logps/chosen": -87.5635986328125, "logps/rejected": -122.162353515625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596953868865967, "rewards/margins": 26.969858169555664, "rewards/rejected": -28.629552841186523, "step": 5620 }, { "epoch": 2.57, "learning_rate": 4.782755705832629e-08, "logits/chosen": -1.4242037534713745, "logits/rejected": -1.3452680110931396, "logps/chosen": -88.09815979003906, "logps/rejected": -124.96726989746094, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.3257520198822021, "rewards/margins": 29.73336410522461, "rewards/rejected": -28.407611846923828, "step": 5630 }, { "epoch": 2.57, "learning_rate": 4.732037193575655e-08, "logits/chosen": -1.4236841201782227, "logits/rejected": -1.3491275310516357, "logps/chosen": -91.39635467529297, "logps/rejected": -121.4232406616211, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2293356955051422, "rewards/margins": 27.425186157226562, "rewards/rejected": -27.195850372314453, "step": 5640 }, { "epoch": 2.58, "learning_rate": 4.681318681318681e-08, "logits/chosen": -1.4207613468170166, "logits/rejected": -1.3308521509170532, "logps/chosen": -91.08123779296875, "logps/rejected": -118.93770599365234, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.274003028869629, "rewards/margins": 28.229358673095703, "rewards/rejected": -26.955352783203125, "step": 5650 }, { "epoch": 2.58, "learning_rate": 4.630600169061707e-08, "logits/chosen": -1.4237556457519531, "logits/rejected": -1.3543760776519775, "logps/chosen": -85.66486358642578, "logps/rejected": -124.79557037353516, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.14971943199634552, "rewards/margins": 28.228811264038086, "rewards/rejected": -28.079092025756836, "step": 5660 }, { "epoch": 2.59, "learning_rate": 4.579881656804733e-08, "logits/chosen": -1.452664852142334, "logits/rejected": -1.3631222248077393, "logps/chosen": -93.4820556640625, "logps/rejected": -123.2723617553711, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.7186456918716431, "rewards/margins": 28.007343292236328, "rewards/rejected": -28.72598648071289, "step": 5670 }, { "epoch": 2.59, "learning_rate": 4.5291631445477594e-08, "logits/chosen": -1.4738563299179077, "logits/rejected": -1.396863341331482, "logps/chosen": -85.08771514892578, "logps/rejected": -125.86419677734375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.7788127660751343, "rewards/margins": 29.58379554748535, "rewards/rejected": -28.804983139038086, "step": 5680 }, { "epoch": 2.6, "learning_rate": 4.478444632290786e-08, "logits/chosen": -1.4112271070480347, "logits/rejected": -1.342129111289978, "logps/chosen": -93.12382507324219, "logps/rejected": -132.10922241210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.42668676376342773, "rewards/margins": 29.515426635742188, "rewards/rejected": -29.0887393951416, "step": 5690 }, { "epoch": 2.6, "learning_rate": 4.427726120033812e-08, "logits/chosen": -1.4182324409484863, "logits/rejected": -1.3355042934417725, "logps/chosen": -95.10716247558594, "logps/rejected": -121.40716552734375, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24203085899353027, "rewards/margins": 27.118215560913086, "rewards/rejected": -27.360248565673828, "step": 5700 }, { "epoch": 2.6, "eval_logits/chosen": -1.521308422088623, "eval_logits/rejected": -1.4287875890731812, "eval_logps/chosen": -88.81271362304688, "eval_logps/rejected": -120.16915893554688, "eval_loss": 0.007557415869086981, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.3098601996898651, "eval_rewards/margins": 27.723575592041016, "eval_rewards/rejected": -27.413715362548828, "eval_runtime": 64.7947, "eval_samples_per_second": 44.17, "eval_steps_per_second": 2.763, "step": 5700 }, { "epoch": 2.61, "learning_rate": 4.377007607776838e-08, "logits/chosen": -1.4278671741485596, "logits/rejected": -1.3431296348571777, "logps/chosen": -88.24993133544922, "logps/rejected": -121.37858581542969, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.6779876947402954, "rewards/margins": 29.29281997680664, "rewards/rejected": -27.614831924438477, "step": 5710 }, { "epoch": 2.61, "learning_rate": 4.3262890955198644e-08, "logits/chosen": -1.3866733312606812, "logits/rejected": -1.3263394832611084, "logps/chosen": -89.70895385742188, "logps/rejected": -118.44575500488281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.36240631341934204, "rewards/margins": 26.25311851501465, "rewards/rejected": -25.89071273803711, "step": 5720 }, { "epoch": 2.62, "learning_rate": 4.2755705832628905e-08, "logits/chosen": -1.4577974081039429, "logits/rejected": -1.3561092615127563, "logps/chosen": -90.57540893554688, "logps/rejected": -122.50830078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9288503527641296, "rewards/margins": 29.591562271118164, "rewards/rejected": -28.662710189819336, "step": 5730 }, { "epoch": 2.62, "learning_rate": 4.2248520710059166e-08, "logits/chosen": -1.430964708328247, "logits/rejected": -1.362330675125122, "logps/chosen": -93.49796295166016, "logps/rejected": -127.06739807128906, "loss": 0.0033, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5318681001663208, "rewards/margins": 28.357601165771484, "rewards/rejected": -27.82573890686035, "step": 5740 }, { "epoch": 2.62, "learning_rate": 4.1741335587489433e-08, "logits/chosen": -1.4357731342315674, "logits/rejected": -1.3574306964874268, "logps/chosen": -88.33089447021484, "logps/rejected": -124.7070083618164, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.2313622683286667, "rewards/margins": 29.021398544311523, "rewards/rejected": -28.790035247802734, "step": 5750 }, { "epoch": 2.63, "learning_rate": 4.1234150464919694e-08, "logits/chosen": -1.4213526248931885, "logits/rejected": -1.364757776260376, "logps/chosen": -81.57777404785156, "logps/rejected": -119.826171875, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9536989331245422, "rewards/margins": 27.143407821655273, "rewards/rejected": -26.189708709716797, "step": 5760 }, { "epoch": 2.63, "learning_rate": 4.0726965342349955e-08, "logits/chosen": -1.4012161493301392, "logits/rejected": -1.3225289583206177, "logps/chosen": -91.90386199951172, "logps/rejected": -119.8282241821289, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7034690380096436, "rewards/margins": 28.835796356201172, "rewards/rejected": -27.132328033447266, "step": 5770 }, { "epoch": 2.64, "learning_rate": 4.0219780219780216e-08, "logits/chosen": -1.4080169200897217, "logits/rejected": -1.338749647140503, "logps/chosen": -94.41007995605469, "logps/rejected": -125.92796325683594, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.140974998474121, "rewards/margins": 26.483749389648438, "rewards/rejected": -25.342771530151367, "step": 5780 }, { "epoch": 2.64, "learning_rate": 3.971259509721048e-08, "logits/chosen": -1.4085487127304077, "logits/rejected": -1.327182650566101, "logps/chosen": -88.75342559814453, "logps/rejected": -119.1405029296875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.0149381160736084, "rewards/margins": 27.902973175048828, "rewards/rejected": -26.888031005859375, "step": 5790 }, { "epoch": 2.65, "learning_rate": 3.920540997464074e-08, "logits/chosen": -1.433281421661377, "logits/rejected": -1.3512113094329834, "logps/chosen": -90.19625854492188, "logps/rejected": -126.81318664550781, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.6478980779647827, "rewards/margins": 29.77631187438965, "rewards/rejected": -29.12841796875, "step": 5800 }, { "epoch": 2.65, "eval_logits/chosen": -1.5241363048553467, "eval_logits/rejected": -1.4326359033584595, "eval_logps/chosen": -88.73265838623047, "eval_logps/rejected": -120.24764251708984, "eval_loss": 0.007415792904794216, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.3498854637145996, "eval_rewards/margins": 27.802854537963867, "eval_rewards/rejected": -27.452966690063477, "eval_runtime": 66.3571, "eval_samples_per_second": 43.13, "eval_steps_per_second": 2.698, "step": 5800 }, { "epoch": 2.65, "learning_rate": 3.8698224852071005e-08, "logits/chosen": -1.3824363946914673, "logits/rejected": -1.3102766275405884, "logps/chosen": -90.2943115234375, "logps/rejected": -120.10107421875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.4802871346473694, "rewards/margins": 27.660263061523438, "rewards/rejected": -27.179973602294922, "step": 5810 }, { "epoch": 2.66, "learning_rate": 3.8191039729501266e-08, "logits/chosen": -1.405815601348877, "logits/rejected": -1.3217995166778564, "logps/chosen": -83.80717468261719, "logps/rejected": -120.4073715209961, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3315509557724, "rewards/margins": 28.07989501953125, "rewards/rejected": -26.748342514038086, "step": 5820 }, { "epoch": 2.66, "learning_rate": 3.768385460693153e-08, "logits/chosen": -1.400803565979004, "logits/rejected": -1.3341869115829468, "logps/chosen": -85.88481140136719, "logps/rejected": -122.70805358886719, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.08723556995391846, "rewards/margins": 27.86008071899414, "rewards/rejected": -27.947315216064453, "step": 5830 }, { "epoch": 2.67, "learning_rate": 3.717666948436179e-08, "logits/chosen": -1.4672653675079346, "logits/rejected": -1.3654979467391968, "logps/chosen": -88.60813903808594, "logps/rejected": -124.53550720214844, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0332581996917725, "rewards/margins": 29.575796127319336, "rewards/rejected": -28.54253578186035, "step": 5840 }, { "epoch": 2.67, "learning_rate": 3.6669484361792056e-08, "logits/chosen": -1.4773852825164795, "logits/rejected": -1.3942975997924805, "logps/chosen": -84.61897277832031, "logps/rejected": -118.28257751464844, "loss": 0.0059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.075499415397644, "rewards/margins": 27.52908706665039, "rewards/rejected": -26.45358657836914, "step": 5850 }, { "epoch": 2.67, "learning_rate": 3.6162299239222316e-08, "logits/chosen": -1.4367072582244873, "logits/rejected": -1.3531558513641357, "logps/chosen": -92.48080444335938, "logps/rejected": -124.60569763183594, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9651349782943726, "rewards/margins": 30.93218421936035, "rewards/rejected": -28.9670467376709, "step": 5860 }, { "epoch": 2.68, "learning_rate": 3.565511411665258e-08, "logits/chosen": -1.4036922454833984, "logits/rejected": -1.3271372318267822, "logps/chosen": -91.8263168334961, "logps/rejected": -123.1822509765625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.49861449003219604, "rewards/margins": 27.064804077148438, "rewards/rejected": -26.56618881225586, "step": 5870 }, { "epoch": 2.68, "learning_rate": 3.514792899408284e-08, "logits/chosen": -1.3846232891082764, "logits/rejected": -1.3122532367706299, "logps/chosen": -84.51948547363281, "logps/rejected": -121.88045501708984, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.880649209022522, "rewards/margins": 28.404842376708984, "rewards/rejected": -27.524194717407227, "step": 5880 }, { "epoch": 2.69, "learning_rate": 3.46407438715131e-08, "logits/chosen": -1.4396214485168457, "logits/rejected": -1.3636229038238525, "logps/chosen": -93.21275329589844, "logps/rejected": -124.11541748046875, "loss": 0.0087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.49375027418136597, "rewards/margins": 28.062625885009766, "rewards/rejected": -27.568878173828125, "step": 5890 }, { "epoch": 2.69, "learning_rate": 3.413355874894336e-08, "logits/chosen": -1.4009182453155518, "logits/rejected": -1.3155372142791748, "logps/chosen": -90.7492446899414, "logps/rejected": -121.24101257324219, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.570563018321991, "rewards/margins": 28.15814781188965, "rewards/rejected": -27.587589263916016, "step": 5900 }, { "epoch": 2.69, "eval_logits/chosen": -1.5298057794570923, "eval_logits/rejected": -1.4371873140335083, "eval_logps/chosen": -88.87010955810547, "eval_logps/rejected": -120.50233459472656, "eval_loss": 0.007439317647367716, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.28115907311439514, "eval_rewards/margins": 27.861465454101562, "eval_rewards/rejected": -27.580307006835938, "eval_runtime": 61.2948, "eval_samples_per_second": 46.692, "eval_steps_per_second": 2.92, "step": 5900 }, { "epoch": 2.7, "learning_rate": 3.362637362637363e-08, "logits/chosen": -1.386823058128357, "logits/rejected": -1.317825198173523, "logps/chosen": -88.96937561035156, "logps/rejected": -118.41639709472656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.38685184717178345, "rewards/margins": 26.644840240478516, "rewards/rejected": -26.257986068725586, "step": 5910 }, { "epoch": 2.7, "learning_rate": 3.311918850380389e-08, "logits/chosen": -1.4166361093521118, "logits/rejected": -1.3327529430389404, "logps/chosen": -91.54283905029297, "logps/rejected": -125.4769515991211, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.6538678407669067, "rewards/margins": 29.736963272094727, "rewards/rejected": -29.083093643188477, "step": 5920 }, { "epoch": 2.71, "learning_rate": 3.261200338123415e-08, "logits/chosen": -1.4290189743041992, "logits/rejected": -1.3503470420837402, "logps/chosen": -93.33061218261719, "logps/rejected": -120.43190002441406, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.5092540979385376, "rewards/margins": 27.29727554321289, "rewards/rejected": -25.788021087646484, "step": 5930 }, { "epoch": 2.71, "learning_rate": 3.210481825866441e-08, "logits/chosen": -1.4096345901489258, "logits/rejected": -1.3365631103515625, "logps/chosen": -92.1152114868164, "logps/rejected": -129.61373901367188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.6155846118927002, "rewards/margins": 27.218673706054688, "rewards/rejected": -27.83425521850586, "step": 5940 }, { "epoch": 2.72, "learning_rate": 3.159763313609467e-08, "logits/chosen": -1.4408024549484253, "logits/rejected": -1.358703851699829, "logps/chosen": -92.8883056640625, "logps/rejected": -127.37141418457031, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.4508030414581299, "rewards/margins": 29.662633895874023, "rewards/rejected": -28.211828231811523, "step": 5950 }, { "epoch": 2.72, "learning_rate": 3.109044801352493e-08, "logits/chosen": -1.4268407821655273, "logits/rejected": -1.3649609088897705, "logps/chosen": -91.47252655029297, "logps/rejected": -127.54573059082031, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9097731709480286, "rewards/margins": 28.423721313476562, "rewards/rejected": -27.51395034790039, "step": 5960 }, { "epoch": 2.72, "learning_rate": 3.05832628909552e-08, "logits/chosen": -1.427631139755249, "logits/rejected": -1.349169373512268, "logps/chosen": -88.80760192871094, "logps/rejected": -119.60186767578125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3762190341949463, "rewards/margins": 28.201892852783203, "rewards/rejected": -26.825674057006836, "step": 5970 }, { "epoch": 2.73, "learning_rate": 3.007607776838546e-08, "logits/chosen": -1.433571219444275, "logits/rejected": -1.3482530117034912, "logps/chosen": -92.32679748535156, "logps/rejected": -124.57658386230469, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08425948768854141, "rewards/margins": 28.501611709594727, "rewards/rejected": -28.4173526763916, "step": 5980 }, { "epoch": 2.73, "learning_rate": 2.956889264581572e-08, "logits/chosen": -1.4247334003448486, "logits/rejected": -1.3412024974822998, "logps/chosen": -97.79167938232422, "logps/rejected": -129.82369995117188, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.341404676437378, "rewards/margins": 30.7274169921875, "rewards/rejected": -29.38601303100586, "step": 5990 }, { "epoch": 2.74, "learning_rate": 2.9061707523245986e-08, "logits/chosen": -1.4527391195297241, "logits/rejected": -1.3852466344833374, "logps/chosen": -84.54109191894531, "logps/rejected": -123.23868560791016, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.4699872136116028, "rewards/margins": 28.9207706451416, "rewards/rejected": -28.450780868530273, "step": 6000 }, { "epoch": 2.74, "eval_logits/chosen": -1.5361018180847168, "eval_logits/rejected": -1.4470702409744263, "eval_logps/chosen": -88.81427764892578, "eval_logps/rejected": -120.10942077636719, "eval_loss": 0.0073595428839325905, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.30907416343688965, "eval_rewards/margins": 27.692928314208984, "eval_rewards/rejected": -27.383853912353516, "eval_runtime": 66.1237, "eval_samples_per_second": 43.283, "eval_steps_per_second": 2.707, "step": 6000 }, { "epoch": 2.74, "learning_rate": 2.8554522400676247e-08, "logits/chosen": -1.437772512435913, "logits/rejected": -1.3702958822250366, "logps/chosen": -88.56996154785156, "logps/rejected": -120.36534118652344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8493738174438477, "rewards/margins": 25.939929962158203, "rewards/rejected": -26.789306640625, "step": 6010 }, { "epoch": 2.75, "learning_rate": 2.8047337278106507e-08, "logits/chosen": -1.3986080884933472, "logits/rejected": -1.3344132900238037, "logps/chosen": -87.32596588134766, "logps/rejected": -124.3438949584961, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7483171820640564, "rewards/margins": 28.974384307861328, "rewards/rejected": -28.2260684967041, "step": 6020 }, { "epoch": 2.75, "learning_rate": 2.754015215553677e-08, "logits/chosen": -1.4268220663070679, "logits/rejected": -1.3452110290527344, "logps/chosen": -91.21931457519531, "logps/rejected": -122.6042709350586, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.31938114762306213, "rewards/margins": 27.742395401000977, "rewards/rejected": -27.423009872436523, "step": 6030 }, { "epoch": 2.76, "learning_rate": 2.7032967032967033e-08, "logits/chosen": -1.4078203439712524, "logits/rejected": -1.341101884841919, "logps/chosen": -88.9620132446289, "logps/rejected": -126.86651611328125, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.03730924054980278, "rewards/margins": 28.27874755859375, "rewards/rejected": -28.24143409729004, "step": 6040 }, { "epoch": 2.76, "learning_rate": 2.6525781910397293e-08, "logits/chosen": -1.4218459129333496, "logits/rejected": -1.3417071104049683, "logps/chosen": -94.39894104003906, "logps/rejected": -126.17645263671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.41776663064956665, "rewards/margins": 27.610363006591797, "rewards/rejected": -27.192596435546875, "step": 6050 }, { "epoch": 2.77, "learning_rate": 2.6018596787827558e-08, "logits/chosen": -1.3829963207244873, "logits/rejected": -1.3349157571792603, "logps/chosen": -81.25392150878906, "logps/rejected": -116.95680236816406, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.4236263334751129, "rewards/margins": 25.6158447265625, "rewards/rejected": -25.192218780517578, "step": 6060 }, { "epoch": 2.77, "learning_rate": 2.551141166525782e-08, "logits/chosen": -1.4441430568695068, "logits/rejected": -1.3542683124542236, "logps/chosen": -94.20352172851562, "logps/rejected": -124.4933090209961, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.7039328813552856, "rewards/margins": 27.073434829711914, "rewards/rejected": -26.369503021240234, "step": 6070 }, { "epoch": 2.77, "learning_rate": 2.500422654268808e-08, "logits/chosen": -1.4258487224578857, "logits/rejected": -1.357102632522583, "logps/chosen": -86.0672607421875, "logps/rejected": -125.6698226928711, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.5518980026245117, "rewards/margins": 29.156414031982422, "rewards/rejected": -28.604516983032227, "step": 6080 }, { "epoch": 2.78, "learning_rate": 2.4497041420118344e-08, "logits/chosen": -1.44197678565979, "logits/rejected": -1.3703984022140503, "logps/chosen": -84.29279327392578, "logps/rejected": -122.15272521972656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.0316746234893799, "rewards/margins": 28.804452896118164, "rewards/rejected": -27.772777557373047, "step": 6090 }, { "epoch": 2.78, "learning_rate": 2.3989856297548605e-08, "logits/chosen": -1.4140945672988892, "logits/rejected": -1.3373467922210693, "logps/chosen": -92.00602722167969, "logps/rejected": -121.0610122680664, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.4240033030509949, "rewards/margins": 27.450969696044922, "rewards/rejected": -27.026966094970703, "step": 6100 }, { "epoch": 2.78, "eval_logits/chosen": -1.528950810432434, "eval_logits/rejected": -1.4393686056137085, "eval_logps/chosen": -88.54615783691406, "eval_logps/rejected": -119.571533203125, "eval_loss": 0.007185075432062149, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.4431401193141937, "eval_rewards/margins": 27.55805015563965, "eval_rewards/rejected": -27.114912033081055, "eval_runtime": 67.6666, "eval_samples_per_second": 42.296, "eval_steps_per_second": 2.645, "step": 6100 }, { "epoch": 2.79, "learning_rate": 2.3482671174978865e-08, "logits/chosen": -1.3959143161773682, "logits/rejected": -1.329087495803833, "logps/chosen": -90.89431762695312, "logps/rejected": -123.38334655761719, "loss": 0.0033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0483791828155518, "rewards/margins": 25.339763641357422, "rewards/rejected": -26.388141632080078, "step": 6110 }, { "epoch": 2.79, "learning_rate": 2.297548605240913e-08, "logits/chosen": -1.4264185428619385, "logits/rejected": -1.3347035646438599, "logps/chosen": -95.05891418457031, "logps/rejected": -122.20552825927734, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.40551090240478516, "rewards/margins": 27.485652923583984, "rewards/rejected": -27.080142974853516, "step": 6120 }, { "epoch": 2.8, "learning_rate": 2.246830092983939e-08, "logits/chosen": -1.4183063507080078, "logits/rejected": -1.3475987911224365, "logps/chosen": -90.90924835205078, "logps/rejected": -123.7392349243164, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.11213073879480362, "rewards/margins": 26.713592529296875, "rewards/rejected": -26.82572364807129, "step": 6130 }, { "epoch": 2.8, "learning_rate": 2.196111580726965e-08, "logits/chosen": -1.3835358619689941, "logits/rejected": -1.304900884628296, "logps/chosen": -94.07560729980469, "logps/rejected": -119.58497619628906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.7558839321136475, "rewards/margins": 26.48470115661621, "rewards/rejected": -25.72881507873535, "step": 6140 }, { "epoch": 2.81, "learning_rate": 2.1453930684699916e-08, "logits/chosen": -1.4276055097579956, "logits/rejected": -1.356466293334961, "logps/chosen": -92.86775207519531, "logps/rejected": -124.24592590332031, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07010960578918457, "rewards/margins": 28.186676025390625, "rewards/rejected": -28.25678062438965, "step": 6150 }, { "epoch": 2.81, "learning_rate": 2.0946745562130177e-08, "logits/chosen": -1.3897449970245361, "logits/rejected": -1.3085477352142334, "logps/chosen": -89.38758850097656, "logps/rejected": -123.3631820678711, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.502859890460968, "rewards/margins": 29.507598876953125, "rewards/rejected": -29.00473976135254, "step": 6160 }, { "epoch": 2.82, "learning_rate": 2.0439560439560437e-08, "logits/chosen": -1.407470464706421, "logits/rejected": -1.3406095504760742, "logps/chosen": -93.42985534667969, "logps/rejected": -124.22257995605469, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7225341200828552, "rewards/margins": 28.852397918701172, "rewards/rejected": -28.12986183166504, "step": 6170 }, { "epoch": 2.82, "learning_rate": 1.99323753169907e-08, "logits/chosen": -1.3827449083328247, "logits/rejected": -1.3094285726547241, "logps/chosen": -93.71525573730469, "logps/rejected": -119.5619125366211, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6188391447067261, "rewards/margins": 26.20868492126465, "rewards/rejected": -25.58984375, "step": 6180 }, { "epoch": 2.83, "learning_rate": 1.9425190194420963e-08, "logits/chosen": -1.3776720762252808, "logits/rejected": -1.3083680868148804, "logps/chosen": -91.71514129638672, "logps/rejected": -120.59306335449219, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6132274270057678, "rewards/margins": 26.796030044555664, "rewards/rejected": -27.409259796142578, "step": 6190 }, { "epoch": 2.83, "learning_rate": 1.8918005071851223e-08, "logits/chosen": -1.4177929162979126, "logits/rejected": -1.3377645015716553, "logps/chosen": -87.91746520996094, "logps/rejected": -118.75276184082031, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.8456829786300659, "rewards/margins": 27.677001953125, "rewards/rejected": -26.831317901611328, "step": 6200 }, { "epoch": 2.83, "eval_logits/chosen": -1.5227910280227661, "eval_logits/rejected": -1.4297844171524048, "eval_logps/chosen": -88.48534393310547, "eval_logps/rejected": -119.60254669189453, "eval_loss": 0.00707243150100112, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.47354137897491455, "eval_rewards/margins": 27.603960037231445, "eval_rewards/rejected": -27.130416870117188, "eval_runtime": 65.0475, "eval_samples_per_second": 43.999, "eval_steps_per_second": 2.752, "step": 6200 }, { "epoch": 2.83, "learning_rate": 1.8410819949281488e-08, "logits/chosen": -1.4211013317108154, "logits/rejected": -1.336397409439087, "logps/chosen": -89.54491424560547, "logps/rejected": -122.1443862915039, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.5423094034194946, "rewards/margins": 28.126476287841797, "rewards/rejected": -26.58416748046875, "step": 6210 }, { "epoch": 2.84, "learning_rate": 1.790363482671175e-08, "logits/chosen": -1.4224960803985596, "logits/rejected": -1.3450286388397217, "logps/chosen": -91.13011169433594, "logps/rejected": -124.55561828613281, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6848041415214539, "rewards/margins": 27.732412338256836, "rewards/rejected": -27.047607421875, "step": 6220 }, { "epoch": 2.84, "learning_rate": 1.7396449704142013e-08, "logits/chosen": -1.4057667255401611, "logits/rejected": -1.3270689249038696, "logps/chosen": -94.62673950195312, "logps/rejected": -121.52132415771484, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3413497507572174, "rewards/margins": 27.458669662475586, "rewards/rejected": -27.117321014404297, "step": 6230 }, { "epoch": 2.85, "learning_rate": 1.6889264581572274e-08, "logits/chosen": -1.433934211730957, "logits/rejected": -1.343518614768982, "logps/chosen": -90.16001892089844, "logps/rejected": -122.25347900390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.359809547662735, "rewards/margins": 28.131118774414062, "rewards/rejected": -27.771310806274414, "step": 6240 }, { "epoch": 2.85, "learning_rate": 1.6382079459002535e-08, "logits/chosen": -1.4110838174819946, "logits/rejected": -1.3541514873504639, "logps/chosen": -91.34024810791016, "logps/rejected": -128.32180786132812, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.36720380187034607, "rewards/margins": 27.321441650390625, "rewards/rejected": -27.688648223876953, "step": 6250 }, { "epoch": 2.86, "learning_rate": 1.58748943364328e-08, "logits/chosen": -1.389146089553833, "logits/rejected": -1.3081094026565552, "logps/chosen": -88.67632293701172, "logps/rejected": -115.4472885131836, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.8905091285705566, "rewards/margins": 27.710601806640625, "rewards/rejected": -26.82008934020996, "step": 6260 }, { "epoch": 2.86, "learning_rate": 1.536770921386306e-08, "logits/chosen": -1.3963115215301514, "logits/rejected": -1.303027868270874, "logps/chosen": -94.36498260498047, "logps/rejected": -124.65643310546875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.32212838530540466, "rewards/margins": 29.300546646118164, "rewards/rejected": -28.97841453552246, "step": 6270 }, { "epoch": 2.87, "learning_rate": 1.486052409129332e-08, "logits/chosen": -1.4071121215820312, "logits/rejected": -1.324487566947937, "logps/chosen": -91.63459777832031, "logps/rejected": -124.7200698852539, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.18130116164684296, "rewards/margins": 28.662633895874023, "rewards/rejected": -28.481334686279297, "step": 6280 }, { "epoch": 2.87, "learning_rate": 1.4353338968723583e-08, "logits/chosen": -1.3938955068588257, "logits/rejected": -1.3304332494735718, "logps/chosen": -86.27269744873047, "logps/rejected": -119.1937255859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.13392703235149384, "rewards/margins": 26.717966079711914, "rewards/rejected": -26.584041595458984, "step": 6290 }, { "epoch": 2.88, "learning_rate": 1.3846153846153846e-08, "logits/chosen": -1.4227640628814697, "logits/rejected": -1.3536622524261475, "logps/chosen": -89.34703063964844, "logps/rejected": -123.28599548339844, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.276293992996216, "rewards/margins": 29.977630615234375, "rewards/rejected": -27.701339721679688, "step": 6300 }, { "epoch": 2.88, "eval_logits/chosen": -1.5174129009246826, "eval_logits/rejected": -1.4249118566513062, "eval_logps/chosen": -88.27791595458984, "eval_logps/rejected": -119.14039611816406, "eval_loss": 0.007073402404785156, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.5772566795349121, "eval_rewards/margins": 27.476600646972656, "eval_rewards/rejected": -26.89933967590332, "eval_runtime": 65.0621, "eval_samples_per_second": 43.989, "eval_steps_per_second": 2.751, "step": 6300 }, { "epoch": 2.88, "learning_rate": 1.3338968723584107e-08, "logits/chosen": -1.4048776626586914, "logits/rejected": -1.3209692239761353, "logps/chosen": -94.9172592163086, "logps/rejected": -121.06196594238281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.9303966760635376, "rewards/margins": 27.744037628173828, "rewards/rejected": -26.813640594482422, "step": 6310 }, { "epoch": 2.88, "learning_rate": 1.2831783601014369e-08, "logits/chosen": -1.4251675605773926, "logits/rejected": -1.3501653671264648, "logps/chosen": -87.30450439453125, "logps/rejected": -124.9731674194336, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.17765673995018005, "rewards/margins": 27.48089599609375, "rewards/rejected": -27.303241729736328, "step": 6320 }, { "epoch": 2.89, "learning_rate": 1.2324598478444632e-08, "logits/chosen": -1.3993405103683472, "logits/rejected": -1.3213218450546265, "logps/chosen": -89.23213958740234, "logps/rejected": -119.50053405761719, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9308843612670898, "rewards/margins": 29.413715362548828, "rewards/rejected": -28.482830047607422, "step": 6330 }, { "epoch": 2.89, "learning_rate": 1.1817413355874893e-08, "logits/chosen": -1.3939188718795776, "logits/rejected": -1.326827883720398, "logps/chosen": -90.73262786865234, "logps/rejected": -124.04630279541016, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.2764853835105896, "rewards/margins": 27.666738510131836, "rewards/rejected": -27.390254974365234, "step": 6340 }, { "epoch": 2.9, "learning_rate": 1.1310228233305155e-08, "logits/chosen": -1.3985464572906494, "logits/rejected": -1.3156406879425049, "logps/chosen": -86.709716796875, "logps/rejected": -120.27693176269531, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8232513666152954, "rewards/margins": 27.919193267822266, "rewards/rejected": -27.0959415435791, "step": 6350 }, { "epoch": 2.9, "learning_rate": 1.080304311073542e-08, "logits/chosen": -1.3861408233642578, "logits/rejected": -1.3142074346542358, "logps/chosen": -89.95799255371094, "logps/rejected": -118.38623046875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.4884513318538666, "rewards/margins": 26.449548721313477, "rewards/rejected": -26.937997817993164, "step": 6360 }, { "epoch": 2.91, "learning_rate": 1.0295857988165679e-08, "logits/chosen": -1.4367835521697998, "logits/rejected": -1.3561252355575562, "logps/chosen": -91.15764617919922, "logps/rejected": -122.2125473022461, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2589377164840698, "rewards/margins": 28.840377807617188, "rewards/rejected": -27.581439971923828, "step": 6370 }, { "epoch": 2.91, "learning_rate": 9.788672865595943e-09, "logits/chosen": -1.4210089445114136, "logits/rejected": -1.3463882207870483, "logps/chosen": -85.81571197509766, "logps/rejected": -118.3035659790039, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070496559143066, "rewards/margins": 27.036890029907227, "rewards/rejected": -26.129840850830078, "step": 6380 }, { "epoch": 2.92, "learning_rate": 9.281487743026204e-09, "logits/chosen": -1.4335598945617676, "logits/rejected": -1.355756163597107, "logps/chosen": -90.24996185302734, "logps/rejected": -118.03495025634766, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1046497821807861, "rewards/margins": 27.585540771484375, "rewards/rejected": -26.480892181396484, "step": 6390 }, { "epoch": 2.92, "learning_rate": 8.774302620456466e-09, "logits/chosen": -1.3927547931671143, "logits/rejected": -1.3146473169326782, "logps/chosen": -85.73943328857422, "logps/rejected": -122.47743225097656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.633237600326538, "rewards/margins": 30.070592880249023, "rewards/rejected": -27.43735694885254, "step": 6400 }, { "epoch": 2.92, "eval_logits/chosen": -1.5202581882476807, "eval_logits/rejected": -1.428078293800354, "eval_logps/chosen": -88.32048034667969, "eval_logps/rejected": -119.28140258789062, "eval_loss": 0.007134352345019579, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.5559725165367126, "eval_rewards/margins": 27.525819778442383, "eval_rewards/rejected": -26.969846725463867, "eval_runtime": 69.0433, "eval_samples_per_second": 41.452, "eval_steps_per_second": 2.593, "step": 6400 }, { "epoch": 2.93, "learning_rate": 8.267117497886729e-09, "logits/chosen": -1.4425251483917236, "logits/rejected": -1.3791589736938477, "logps/chosen": -87.10059356689453, "logps/rejected": -123.00959777832031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5705853700637817, "rewards/margins": 27.75478744506836, "rewards/rejected": -27.1842041015625, "step": 6410 }, { "epoch": 2.93, "learning_rate": 7.75993237531699e-09, "logits/chosen": -1.3837153911590576, "logits/rejected": -1.322196364402771, "logps/chosen": -84.60411071777344, "logps/rejected": -121.41426849365234, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2655871510505676, "rewards/margins": 26.094432830810547, "rewards/rejected": -25.828845977783203, "step": 6420 }, { "epoch": 2.93, "learning_rate": 7.252747252747252e-09, "logits/chosen": -1.4268747568130493, "logits/rejected": -1.349467158317566, "logps/chosen": -89.53246307373047, "logps/rejected": -123.66633605957031, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6228126883506775, "rewards/margins": 29.124929428100586, "rewards/rejected": -28.502117156982422, "step": 6430 }, { "epoch": 2.94, "learning_rate": 6.745562130177514e-09, "logits/chosen": -1.3894002437591553, "logits/rejected": -1.328368067741394, "logps/chosen": -85.84336853027344, "logps/rejected": -120.75773620605469, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8587135076522827, "rewards/margins": 26.4877986907959, "rewards/rejected": -25.62908363342285, "step": 6440 }, { "epoch": 2.94, "learning_rate": 6.238377007607776e-09, "logits/chosen": -1.4132146835327148, "logits/rejected": -1.3355062007904053, "logps/chosen": -87.57127380371094, "logps/rejected": -124.08438873291016, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5653873682022095, "rewards/margins": 28.114559173583984, "rewards/rejected": -27.549169540405273, "step": 6450 }, { "epoch": 2.95, "learning_rate": 5.731191885038039e-09, "logits/chosen": -1.4229376316070557, "logits/rejected": -1.3501025438308716, "logps/chosen": -88.9939956665039, "logps/rejected": -123.60148620605469, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.9622406959533691, "rewards/margins": 28.20614242553711, "rewards/rejected": -27.243900299072266, "step": 6460 }, { "epoch": 2.95, "learning_rate": 5.224006762468301e-09, "logits/chosen": -1.3894935846328735, "logits/rejected": -1.3000738620758057, "logps/chosen": -93.95217895507812, "logps/rejected": -121.9637451171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.0586116313934326, "rewards/margins": 28.589069366455078, "rewards/rejected": -27.530452728271484, "step": 6470 }, { "epoch": 2.96, "learning_rate": 4.7168216398985626e-09, "logits/chosen": -1.396301507949829, "logits/rejected": -1.3185522556304932, "logps/chosen": -89.41260528564453, "logps/rejected": -119.04069519042969, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.45848971605300903, "rewards/margins": 27.00503158569336, "rewards/rejected": -26.54654312133789, "step": 6480 }, { "epoch": 2.96, "learning_rate": 4.209636517328825e-09, "logits/chosen": -1.3974330425262451, "logits/rejected": -1.3252642154693604, "logps/chosen": -85.48316955566406, "logps/rejected": -120.0402603149414, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.14276909828186035, "rewards/margins": 27.301326751708984, "rewards/rejected": -27.158557891845703, "step": 6490 }, { "epoch": 2.97, "learning_rate": 3.702451394759087e-09, "logits/chosen": -1.4150840044021606, "logits/rejected": -1.333387017250061, "logps/chosen": -89.85456085205078, "logps/rejected": -121.59066009521484, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.14147010445594788, "rewards/margins": 26.220012664794922, "rewards/rejected": -26.078542709350586, "step": 6500 }, { "epoch": 2.97, "eval_logits/chosen": -1.524153232574463, "eval_logits/rejected": -1.4333622455596924, "eval_logps/chosen": -88.29774475097656, "eval_logps/rejected": -119.24208068847656, "eval_loss": 0.006867639254778624, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.5673406720161438, "eval_rewards/margins": 27.51752281188965, "eval_rewards/rejected": -26.95018196105957, "eval_runtime": 78.1379, "eval_samples_per_second": 36.628, "eval_steps_per_second": 2.291, "step": 6500 } ], "logging_steps": 10, "max_steps": 6573, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }