{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5234231876472127, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 126.29230499267578, "learning_rate": 0.0, "logits/chosen": -0.40118408203125, "logits/rejected": -0.41802978515625, "logps/chosen": -297.609375, "logps/rejected": -247.84375, "logps/weighted_chosen": -4.5152587890625, "logps/weighted_rejected": -3.032470703125, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 296.4369812011719, "learning_rate": 4.6875e-08, "logits/chosen": -0.3177456259727478, "logits/rejected": -0.3534359335899353, "logps/chosen": -275.5711669921875, "logps/rejected": -255.90451049804688, "logps/weighted_chosen": -2.350965738296509, "logps/weighted_rejected": -2.549940347671509, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": -0.0401475690305233, "rewards/margins": 0.04296875, "rewards/rejected": -0.0831163227558136, "rewards/weighted_accuracies": 0.3229166567325592, "rewards/weighted_chosen": -0.00032212998485192657, "rewards/weighted_margins": 0.00019327799964230508, "rewards/weighted_rejected": -0.0005154079990461469, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 21.67967414855957, "learning_rate": 9.895833333333332e-08, "logits/chosen": -0.29769057035446167, "logits/rejected": -0.313650518655777, "logps/chosen": -294.3374938964844, "logps/rejected": -272.6703186035156, "logps/weighted_chosen": -2.13031005859375, "logps/weighted_rejected": -2.103222608566284, "loss": 0.6908, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06103515625, "rewards/margins": -0.01318359375, "rewards/rejected": -0.0478515625, "rewards/weighted_accuracies": 0.4437499940395355, "rewards/weighted_chosen": 0.0014366150135174394, "rewards/weighted_margins": 0.0021545409690588713, "rewards/weighted_rejected": -0.0007179260137490928, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 76.9887466430664, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -0.2917121946811676, "logits/rejected": -0.337240606546402, "logps/chosen": -298.02655029296875, "logps/rejected": -268.12188720703125, "logps/weighted_chosen": -2.0724120140075684, "logps/weighted_rejected": -2.4466919898986816, "loss": 0.6912, "rewards/accuracies": 0.28125, "rewards/chosen": -0.0062500000931322575, "rewards/margins": -0.02509765699505806, "rewards/rejected": 0.01884765550494194, "rewards/weighted_accuracies": 0.4281249940395355, "rewards/weighted_chosen": 0.0027938843704760075, "rewards/weighted_margins": 0.0019706725142896175, "rewards/weighted_rejected": 0.0008232116815634072, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 32.98203659057617, "learning_rate": 2.03125e-07, "logits/chosen": -0.3011154234409332, "logits/rejected": -0.3432762026786804, "logps/chosen": -278.63751220703125, "logps/rejected": -253.88125610351562, "logps/weighted_chosen": -2.2070555686950684, "logps/weighted_rejected": -2.605224609375, "loss": 0.692, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": 0.0034667968284338713, "rewards/margins": -0.05991210788488388, "rewards/rejected": 0.06337890774011612, "rewards/weighted_accuracies": 0.35624998807907104, "rewards/weighted_chosen": 0.0014549255138263106, "rewards/weighted_margins": -0.00034332275390625, "rewards/weighted_rejected": 0.0017982482677325606, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 20.751684188842773, "learning_rate": 2.552083333333333e-07, "logits/chosen": -0.2822524905204773, "logits/rejected": -0.32080918550491333, "logps/chosen": -280.31329345703125, "logps/rejected": -267.58709716796875, "logps/weighted_chosen": -2.136962890625, "logps/weighted_rejected": -2.1753907203674316, "loss": 0.6883, "rewards/accuracies": 0.3125, "rewards/chosen": -0.07236327975988388, "rewards/margins": -0.09189452975988388, "rewards/rejected": 0.01953125, "rewards/weighted_accuracies": 0.4375, "rewards/weighted_chosen": 0.0054107666946947575, "rewards/weighted_margins": 0.0078063965775072575, "rewards/weighted_rejected": -0.0023956298828125, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 40.70024108886719, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -0.3149581849575043, "logits/rejected": -0.3086872100830078, "logps/chosen": -277.6031188964844, "logps/rejected": -261.8031311035156, "logps/weighted_chosen": -2.5905518531799316, "logps/weighted_rejected": -2.4834961891174316, "loss": 0.6874, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.03662109375, "rewards/margins": 0.12646484375, "rewards/rejected": -0.08984375, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": 0.0004280090215615928, "rewards/weighted_margins": 0.01105651818215847, "rewards/weighted_rejected": -0.01062927208840847, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 67.51947021484375, "learning_rate": 3.59375e-07, "logits/chosen": -0.318746954202652, "logits/rejected": -0.32574766874313354, "logps/chosen": -289.90313720703125, "logps/rejected": -245.04452514648438, "logps/weighted_chosen": -2.098431348800659, "logps/weighted_rejected": -2.392407178878784, "loss": 0.6841, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.16708984971046448, "rewards/margins": 0.4442382752895355, "rewards/rejected": -0.27714842557907104, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": 0.015575408935546875, "rewards/weighted_margins": 0.02174072340130806, "rewards/weighted_rejected": -0.00616531353443861, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 68.87100982666016, "learning_rate": 4.114583333333333e-07, "logits/chosen": -0.286581426858902, "logits/rejected": -0.3082527220249176, "logps/chosen": -289.5101623535156, "logps/rejected": -270.4375, "logps/weighted_chosen": -2.2385497093200684, "logps/weighted_rejected": -2.4218382835388184, "loss": 0.6727, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.31572264432907104, "rewards/margins": 0.5547851324081421, "rewards/rejected": -0.23906250298023224, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": 0.03613891452550888, "rewards/weighted_margins": 0.05283202975988388, "rewards/weighted_rejected": -0.01669769361615181, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 40.29203414916992, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -0.3158706724643707, "logits/rejected": -0.30914992094039917, "logps/chosen": -280.5726623535156, "logps/rejected": -258.17657470703125, "logps/weighted_chosen": -2.45281982421875, "logps/weighted_rejected": -2.5444703102111816, "loss": 0.6683, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.15966796875, "rewards/margins": 0.599609375, "rewards/rejected": -0.43994140625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": 0.05808715894818306, "rewards/weighted_margins": 0.07471618801355362, "rewards/weighted_rejected": -0.0166168212890625, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 46.855377197265625, "learning_rate": 5.156249999999999e-07, "logits/chosen": -0.2856552004814148, "logits/rejected": -0.3585342466831207, "logps/chosen": -291.05548095703125, "logps/rejected": -287.078125, "logps/weighted_chosen": -1.9577789306640625, "logps/weighted_rejected": -2.532482862472534, "loss": 0.6785, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13925781846046448, "rewards/margins": 0.9869140386581421, "rewards/rejected": -1.1261718273162842, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": 0.03715210035443306, "rewards/weighted_margins": 0.0635833740234375, "rewards/weighted_rejected": -0.02643737755715847, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 55.04579162597656, "learning_rate": 5.677083333333333e-07, "logits/chosen": -0.33493995666503906, "logits/rejected": -0.3254844546318054, "logps/chosen": -297.2953186035156, "logps/rejected": -262.6773376464844, "logps/weighted_chosen": -2.606689453125, "logps/weighted_rejected": -2.648364305496216, "loss": 0.6821, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.9228515625, "rewards/margins": 0.8955078125, "rewards/rejected": -1.818359375, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": -0.005747986026108265, "rewards/weighted_margins": 0.05161895602941513, "rewards/weighted_rejected": -0.05732421949505806, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 22.23135757446289, "learning_rate": 6.197916666666666e-07, "logits/chosen": -0.3393222689628601, "logits/rejected": -0.36481350660324097, "logps/chosen": -295.6703186035156, "logps/rejected": -256.3296813964844, "logps/weighted_chosen": -1.8351562023162842, "logps/weighted_rejected": -2.124218702316284, "loss": 0.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.071679711341858, "rewards/margins": 1.46142578125, "rewards/rejected": -2.5331053733825684, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": 0.0018810272449627519, "rewards/weighted_margins": 0.06835174560546875, "rewards/weighted_rejected": -0.0664466843008995, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 57.93917465209961, "learning_rate": 6.718749999999999e-07, "logits/chosen": -0.30284881591796875, "logits/rejected": -0.2989334166049957, "logps/chosen": -306.5074157714844, "logps/rejected": -279.8265686035156, "logps/weighted_chosen": -1.910064697265625, "logps/weighted_rejected": -2.2278685569763184, "loss": 0.6738, "rewards/accuracies": 0.578125, "rewards/chosen": -0.45097655057907104, "rewards/margins": 1.7268555164337158, "rewards/rejected": -2.177734375, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": 0.02166290208697319, "rewards/weighted_margins": 0.07758025825023651, "rewards/weighted_rejected": -0.05589141696691513, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 66.64070892333984, "learning_rate": 7.239583333333333e-07, "logits/chosen": -0.34190064668655396, "logits/rejected": -0.3586837649345398, "logps/chosen": -300.01483154296875, "logps/rejected": -276.1703186035156, "logps/weighted_chosen": -2.202807664871216, "logps/weighted_rejected": -2.474353075027466, "loss": 0.6635, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3230469226837158, "rewards/margins": 2.2220702171325684, "rewards/rejected": -3.545117139816284, "rewards/weighted_accuracies": 0.609375, "rewards/weighted_chosen": 0.0006683349492959678, "rewards/weighted_margins": 0.10604552924633026, "rewards/weighted_rejected": -0.1053924560546875, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 18.789766311645508, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.2976974546909332, "logits/rejected": -0.3081321716308594, "logps/chosen": -286.27813720703125, "logps/rejected": -255.4640655517578, "logps/weighted_chosen": -2.7657103538513184, "logps/weighted_rejected": -2.831347703933716, "loss": 0.6605, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.104687452316284, "rewards/margins": 2.5054688453674316, "rewards/rejected": -4.610156059265137, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.0018630981212481856, "rewards/weighted_margins": 0.158416748046875, "rewards/weighted_rejected": -0.1603546142578125, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 51.51210021972656, "learning_rate": 8.28125e-07, "logits/chosen": -0.3341739773750305, "logits/rejected": -0.3859619200229645, "logps/chosen": -306.4765625, "logps/rejected": -279.1148376464844, "logps/weighted_chosen": -2.3189454078674316, "logps/weighted_rejected": -2.36669921875, "loss": 0.636, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.575390577316284, "rewards/margins": 3.349609375, "rewards/rejected": -5.925000190734863, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.02147369459271431, "rewards/weighted_margins": 0.22438660264015198, "rewards/weighted_rejected": -0.203105166554451, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 398.3809509277344, "learning_rate": 8.802083333333333e-07, "logits/chosen": -0.36855775117874146, "logits/rejected": -0.37070387601852417, "logps/chosen": -307.1656188964844, "logps/rejected": -265.78436279296875, "logps/weighted_chosen": -2.459460496902466, "logps/weighted_rejected": -2.757373094558716, "loss": 0.6811, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.097460746765137, "rewards/margins": 3.488476514816284, "rewards/rejected": -7.585839748382568, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.018505096435546875, "rewards/weighted_margins": 0.19701537489891052, "rewards/weighted_rejected": -0.17839965224266052, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 55.77580261230469, "learning_rate": 9.322916666666666e-07, "logits/chosen": -0.3392753601074219, "logits/rejected": -0.35816192626953125, "logps/chosen": -278.99530029296875, "logps/rejected": -265.18359375, "logps/weighted_chosen": -2.362103223800659, "logps/weighted_rejected": -2.754711866378784, "loss": 0.6944, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -5.346972465515137, "rewards/margins": 3.5015625953674316, "rewards/rejected": -8.848730087280273, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.05782318115234375, "rewards/weighted_margins": 0.16480103135108948, "rewards/weighted_rejected": -0.22255554795265198, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 38.015960693359375, "learning_rate": 9.84375e-07, "logits/chosen": -0.3686843812465668, "logits/rejected": -0.4041244387626648, "logps/chosen": -314.3070373535156, "logps/rejected": -276.7484436035156, "logps/weighted_chosen": -2.123486280441284, "logps/weighted_rejected": -2.6261963844299316, "loss": 0.6392, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -8.581738471984863, "rewards/margins": 4.317968845367432, "rewards/rejected": -12.900781631469727, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.129638671875, "rewards/weighted_margins": 0.2160186767578125, "rewards/weighted_rejected": -0.345590204000473, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 46.52367401123047, "learning_rate": 9.99959085414323e-07, "logits/chosen": -0.4128967225551605, "logits/rejected": -0.4471847414970398, "logps/chosen": -320.0546875, "logps/rejected": -273.11248779296875, "logps/weighted_chosen": -2.5019164085388184, "logps/weighted_rejected": -2.9936890602111816, "loss": 0.6473, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -10.737597465515137, "rewards/margins": 4.738671779632568, "rewards/rejected": -15.476171493530273, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.11443634331226349, "rewards/weighted_margins": 0.2610321044921875, "rewards/weighted_rejected": -0.37534791231155396, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 21.238189697265625, "learning_rate": 9.997587035630105e-07, "logits/chosen": -0.4288749694824219, "logits/rejected": -0.4688262939453125, "logps/chosen": -300.0765686035156, "logps/rejected": -304.63751220703125, "logps/weighted_chosen": -2.32427978515625, "logps/weighted_rejected": -3.0592284202575684, "loss": 0.6424, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -13.117578506469727, "rewards/margins": 7.013671875, "rewards/rejected": -20.133594512939453, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.21423491835594177, "rewards/weighted_margins": 0.27025145292282104, "rewards/weighted_rejected": -0.4845077395439148, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 24.92041015625, "learning_rate": 9.99391406364405e-07, "logits/chosen": -0.42696380615234375, "logits/rejected": -0.429006963968277, "logps/chosen": -305.4906311035156, "logps/rejected": -288.6312561035156, "logps/weighted_chosen": -2.625018358230591, "logps/weighted_rejected": -3.102160692214966, "loss": 0.6601, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -13.349413871765137, "rewards/margins": 6.373632907867432, "rewards/rejected": -19.72265625, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.20062866806983948, "rewards/weighted_margins": 0.316873162984848, "rewards/weighted_rejected": -0.5174545049667358, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 147.95851135253906, "learning_rate": 9.988573164927884e-07, "logits/chosen": -0.3811447024345398, "logits/rejected": -0.4161086976528168, "logps/chosen": -281.33203125, "logps/rejected": -274.234375, "logps/weighted_chosen": -2.32806396484375, "logps/weighted_rejected": -2.6552734375, "loss": 0.7195, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -12.46875, "rewards/margins": 10.046093940734863, "rewards/rejected": -22.515430450439453, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.24639587104320526, "rewards/weighted_margins": 0.23908081650733948, "rewards/weighted_rejected": -0.4853073060512543, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 26.882122039794922, "learning_rate": 9.98156612329838e-07, "logits/chosen": -0.4748245179653168, "logits/rejected": -0.5250595211982727, "logps/chosen": -278.16717529296875, "logps/rejected": -306.29376220703125, "logps/weighted_chosen": -2.348803758621216, "logps/weighted_rejected": -2.9455933570861816, "loss": 0.6674, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -13.405566215515137, "rewards/margins": 10.753710746765137, "rewards/rejected": -24.158985137939453, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.14908751845359802, "rewards/weighted_margins": 0.33162689208984375, "rewards/weighted_rejected": -0.48021697998046875, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 86.49760437011719, "learning_rate": 9.97289527905053e-07, "logits/chosen": -0.478302001953125, "logits/rejected": -0.48861923813819885, "logps/chosen": -277.0523376464844, "logps/rejected": -275.80938720703125, "logps/weighted_chosen": -2.61376953125, "logps/weighted_rejected": -2.787853956222534, "loss": 0.7022, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -12.673730850219727, "rewards/margins": 7.1806640625, "rewards/rejected": -19.852344512939453, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.12388916313648224, "rewards/weighted_margins": 0.19627074897289276, "rewards/weighted_rejected": -0.3203796446323395, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 19.698871612548828, "learning_rate": 9.962563528175875e-07, "logits/chosen": -0.4065658450126648, "logits/rejected": -0.4432968199253082, "logps/chosen": -310.62890625, "logps/rejected": -281.46405029296875, "logps/weighted_chosen": -2.184094190597534, "logps/weighted_rejected": -3.0492796897888184, "loss": 0.6507, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -11.896581649780273, "rewards/margins": 8.622265815734863, "rewards/rejected": -20.520313262939453, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.1260833740234375, "rewards/weighted_margins": 0.25025635957717896, "rewards/weighted_rejected": -0.3761749267578125, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 16.363121032714844, "learning_rate": 9.950574321395277e-07, "logits/chosen": -0.42208632826805115, "logits/rejected": -0.4458427429199219, "logps/chosen": -305.9046936035156, "logps/rejected": -286.06561279296875, "logps/weighted_chosen": -2.40838623046875, "logps/weighted_rejected": -2.7938475608825684, "loss": 0.6573, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -16.622364044189453, "rewards/margins": 6.233202934265137, "rewards/rejected": -22.855077743530273, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.189697265625, "rewards/weighted_margins": 0.27490538358688354, "rewards/weighted_rejected": -0.4645233154296875, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 54.42692947387695, "learning_rate": 9.936931663006413e-07, "logits/chosen": -0.45263671875, "logits/rejected": -0.44363707304000854, "logps/chosen": -316.171875, "logps/rejected": -303.3656311035156, "logps/weighted_chosen": -2.4659423828125, "logps/weighted_rejected": -3.0541749000549316, "loss": 0.6068, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -13.402734756469727, "rewards/margins": 10.619824409484863, "rewards/rejected": -24.025390625, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.05214080959558487, "rewards/weighted_margins": 0.40336912870407104, "rewards/weighted_rejected": -0.455657958984375, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 39.20017623901367, "learning_rate": 9.921640109546357e-07, "logits/chosen": -0.42310255765914917, "logits/rejected": -0.48920440673828125, "logps/chosen": -283.7171936035156, "logps/rejected": -278.1859436035156, "logps/weighted_chosen": -2.396167039871216, "logps/weighted_rejected": -3.5881590843200684, "loss": 0.6649, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -16.9111328125, "rewards/margins": 9.704492568969727, "rewards/rejected": -26.62109375, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": -0.130279541015625, "rewards/weighted_margins": 0.3882461488246918, "rewards/weighted_rejected": -0.5187179446220398, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 28.03601837158203, "learning_rate": 9.90470476826975e-07, "logits/chosen": -0.485189825296402, "logits/rejected": -0.48862916231155396, "logps/chosen": -289.09765625, "logps/rejected": -297.625, "logps/weighted_chosen": -2.2784485816955566, "logps/weighted_rejected": -2.771862745285034, "loss": 0.6608, "rewards/accuracies": 0.640625, "rewards/chosen": -19.676952362060547, "rewards/margins": 10.679491996765137, "rewards/rejected": -30.360937118530273, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.1669921875, "rewards/weighted_margins": 0.28967589139938354, "rewards/weighted_rejected": -0.4566032290458679, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 538.50927734375, "learning_rate": 9.886131295443002e-07, "logits/chosen": -0.654278576374054, "logits/rejected": -0.7076683044433594, "logps/chosen": -341.85467529296875, "logps/rejected": -309.89764404296875, "logps/weighted_chosen": -2.629150390625, "logps/weighted_rejected": -2.8698973655700684, "loss": 0.6788, "rewards/accuracies": 0.515625, "rewards/chosen": -60.568748474121094, "rewards/margins": -0.7822265625, "rewards/rejected": -59.785743713378906, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.24791869521141052, "rewards/weighted_margins": 0.28089600801467896, "rewards/weighted_rejected": -0.528765857219696, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 34.656883239746094, "learning_rate": 9.865925894455166e-07, "logits/chosen": -0.7003936767578125, "logits/rejected": -0.719250500202179, "logps/chosen": -326.3960876464844, "logps/rejected": -290.3453063964844, "logps/weighted_chosen": -2.553356885910034, "logps/weighted_rejected": -3.10992431640625, "loss": 0.7054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.532812118530273, "rewards/margins": 6.3564453125, "rewards/rejected": -32.88788986206055, "rewards/weighted_accuracies": 0.609375, "rewards/weighted_chosen": -0.18020018935203552, "rewards/weighted_margins": 0.3489990234375, "rewards/weighted_rejected": -0.5293639898300171, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 61.1888542175293, "learning_rate": 9.84409531374603e-07, "logits/chosen": -0.6631911993026733, "logits/rejected": -0.6448425054550171, "logps/chosen": -324.87579345703125, "logps/rejected": -291.71875, "logps/weighted_chosen": -2.5611815452575684, "logps/weighted_rejected": -3.060229539871216, "loss": 0.6449, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -20.070018768310547, "rewards/margins": 8.8251953125, "rewards/rejected": -28.8876953125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.140888974070549, "rewards/weighted_margins": 0.3719635009765625, "rewards/weighted_rejected": -0.5127013921737671, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 27.2315673828125, "learning_rate": 9.820646844552219e-07, "logits/chosen": -0.6496349573135376, "logits/rejected": -0.7006805539131165, "logps/chosen": -295.5882873535156, "logps/rejected": -297.4906311035156, "logps/weighted_chosen": -2.6988892555236816, "logps/weighted_rejected": -2.898681640625, "loss": 0.6788, "rewards/accuracies": 0.6875, "rewards/chosen": -19.176855087280273, "rewards/margins": 12.649316787719727, "rewards/rejected": -31.822460174560547, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.23918533325195312, "rewards/weighted_margins": 0.2961669862270355, "rewards/weighted_rejected": -0.5351837277412415, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 24.073888778686523, "learning_rate": 9.795588318471964e-07, "logits/chosen": -0.7137314081192017, "logits/rejected": -0.7225399017333984, "logps/chosen": -277.8890686035156, "logps/rejected": -304.59063720703125, "logps/weighted_chosen": -2.4300780296325684, "logps/weighted_rejected": -2.771411180496216, "loss": 0.6675, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -18.445703506469727, "rewards/margins": 9.8720703125, "rewards/rejected": -28.31640625, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -0.20159301161766052, "rewards/weighted_margins": 0.2779785096645355, "rewards/weighted_rejected": -0.47947996854782104, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 12.750471115112305, "learning_rate": 9.768928104849415e-07, "logits/chosen": -0.7212737798690796, "logits/rejected": -0.7225433588027954, "logps/chosen": -299.53594970703125, "logps/rejected": -275.5718688964844, "logps/weighted_chosen": -2.667529344558716, "logps/weighted_rejected": -2.800830125808716, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -16.816015243530273, "rewards/margins": 9.876562118530273, "rewards/rejected": -26.690235137939453, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.15018615126609802, "rewards/weighted_margins": 0.301962286233902, "rewards/weighted_rejected": -0.45206451416015625, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 29.441747665405273, "learning_rate": 9.740675107979355e-07, "logits/chosen": -0.6865798830986023, "logits/rejected": -0.7117553949356079, "logps/chosen": -331.06561279296875, "logps/rejected": -300.31719970703125, "logps/weighted_chosen": -1.9907715320587158, "logps/weighted_rejected": -2.932177782058716, "loss": 0.6819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -16.022266387939453, "rewards/margins": 9.630078315734863, "rewards/rejected": -25.654491424560547, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.18877258896827698, "rewards/weighted_margins": 0.2808380126953125, "rewards/weighted_rejected": -0.4699081480503082, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 29.483524322509766, "learning_rate": 9.71083876413323e-07, "logits/chosen": -0.6637862920761108, "logits/rejected": -0.669873058795929, "logps/chosen": -322.3882751464844, "logps/rejected": -300.85858154296875, "logps/weighted_chosen": -2.189379930496216, "logps/weighted_rejected": -2.9217162132263184, "loss": 0.6846, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -20.004688262939453, "rewards/margins": 11.246289253234863, "rewards/rejected": -31.24609375, "rewards/weighted_accuracies": 0.6000000238418579, "rewards/weighted_chosen": -0.22467346489429474, "rewards/weighted_margins": 0.2720580995082855, "rewards/weighted_rejected": -0.4967102110385895, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 20.563907623291016, "learning_rate": 9.67942903840751e-07, "logits/chosen": -0.7051689028739929, "logits/rejected": -0.7537201046943665, "logps/chosen": -324.1015625, "logps/rejected": -310.375, "logps/weighted_chosen": -2.397631883621216, "logps/weighted_rejected": -2.950610399246216, "loss": 0.6478, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -19.788671493530273, "rewards/margins": 16.317577362060547, "rewards/rejected": -36.111328125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.22498169541358948, "rewards/weighted_margins": 0.3520751893520355, "rewards/weighted_rejected": -0.5770629644393921, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 23.1771183013916, "learning_rate": 9.646456421395447e-07, "logits/chosen": -0.7504974603652954, "logits/rejected": -0.7628723382949829, "logps/chosen": -341.2171936035156, "logps/rejected": -343.9375, "logps/weighted_chosen": -2.2680420875549316, "logps/weighted_rejected": -3.0065674781799316, "loss": 0.6746, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -22.944530487060547, "rewards/margins": 17.315624237060547, "rewards/rejected": -40.2587890625, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.20337525010108948, "rewards/weighted_margins": 0.216084286570549, "rewards/weighted_rejected": -0.4196624755859375, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 14.420520782470703, "learning_rate": 9.611931925683266e-07, "logits/chosen": -0.7154334783554077, "logits/rejected": -0.7491073608398438, "logps/chosen": -331.38983154296875, "logps/rejected": -303.3890686035156, "logps/weighted_chosen": -2.190844774246216, "logps/weighted_rejected": -2.7472167015075684, "loss": 0.6135, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -25.111621856689453, "rewards/margins": 14.543554306030273, "rewards/rejected": -39.658592224121094, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.21762695908546448, "rewards/weighted_margins": 0.36674195528030396, "rewards/weighted_rejected": -0.584503173828125, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 27.429603576660156, "learning_rate": 9.575867082172085e-07, "logits/chosen": -0.7379547357559204, "logits/rejected": -0.7826202511787415, "logps/chosen": -337.46405029296875, "logps/rejected": -320.23907470703125, "logps/weighted_chosen": -2.6387085914611816, "logps/weighted_rejected": -2.712329149246216, "loss": 0.6716, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -32.554298400878906, "rewards/margins": 17.513866424560547, "rewards/rejected": -50.060157775878906, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.35613709688186646, "rewards/weighted_margins": 0.38392335176467896, "rewards/weighted_rejected": -0.7401062250137329, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 18.97144889831543, "learning_rate": 9.538273936226673e-07, "logits/chosen": -0.778491199016571, "logits/rejected": -0.811004638671875, "logps/chosen": -292.83984375, "logps/rejected": -304.05352783203125, "logps/weighted_chosen": -2.796630859375, "logps/weighted_rejected": -3.2444825172424316, "loss": 0.6544, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -27.327733993530273, "rewards/margins": 12.26318359375, "rewards/rejected": -39.58984375, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.21069030463695526, "rewards/weighted_margins": 0.3678832948207855, "rewards/weighted_rejected": -0.57806396484375, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 25.43462371826172, "learning_rate": 9.499165043652391e-07, "logits/chosen": -0.7674010992050171, "logits/rejected": -0.7686828374862671, "logps/chosen": -319.55859375, "logps/rejected": -309.03436279296875, "logps/weighted_chosen": -2.82305908203125, "logps/weighted_rejected": -2.993237257003784, "loss": 0.631, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -28.386133193969727, "rewards/margins": 13.9384765625, "rewards/rejected": -42.326072692871094, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.26903897523880005, "rewards/weighted_margins": 0.353515625, "rewards/weighted_rejected": -0.622546374797821, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 65.78443908691406, "learning_rate": 9.458553466501665e-07, "logits/chosen": -0.8066772222518921, "logits/rejected": -0.8363037109375, "logps/chosen": -314.7945251464844, "logps/rejected": -287.65313720703125, "logps/weighted_chosen": -2.8233399391174316, "logps/weighted_rejected": -3.013622999191284, "loss": 0.6831, "rewards/accuracies": 0.640625, "rewards/chosen": -28.642578125, "rewards/margins": 15.162694931030273, "rewards/rejected": -43.80976486206055, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.3802246153354645, "rewards/weighted_margins": 0.3342132568359375, "rewards/weighted_rejected": -0.7139984369277954, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 13.290085792541504, "learning_rate": 9.416452768711366e-07, "logits/chosen": -0.7957550287246704, "logits/rejected": -0.8287414312362671, "logps/chosen": -323.0093688964844, "logps/rejected": -306.98907470703125, "logps/weighted_chosen": -2.544872999191284, "logps/weighted_rejected": -3.100903272628784, "loss": 0.6759, "rewards/accuracies": 0.640625, "rewards/chosen": -30.906835556030273, "rewards/margins": 16.355859756469727, "rewards/rejected": -47.24980545043945, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.343759149312973, "rewards/weighted_margins": 0.40337830781936646, "rewards/weighted_rejected": -0.747100830078125, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 21.766939163208008, "learning_rate": 9.372877011572557e-07, "logits/chosen": -0.7200164794921875, "logits/rejected": -0.742279052734375, "logps/chosen": -342.75079345703125, "logps/rejected": -318.60626220703125, "logps/weighted_chosen": -2.5311522483825684, "logps/weighted_rejected": -2.990124464035034, "loss": 0.63, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -32.077247619628906, "rewards/margins": 11.883398056030273, "rewards/rejected": -43.959373474121094, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -0.30525511503219604, "rewards/weighted_margins": 0.45678406953811646, "rewards/weighted_rejected": -0.7624969482421875, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 28.76239776611328, "learning_rate": 9.327840749034141e-07, "logits/chosen": -0.7930053472518921, "logits/rejected": -0.8311401605606079, "logps/chosen": -316.79998779296875, "logps/rejected": -326.5062561035156, "logps/weighted_chosen": -2.4120116233825684, "logps/weighted_rejected": -3.591870069503784, "loss": 0.6639, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -29.640039443969727, "rewards/margins": 20.350976943969727, "rewards/rejected": -49.9853515625, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -0.27521055936813354, "rewards/weighted_margins": 0.4522705078125, "rewards/weighted_rejected": -0.7274719476699829, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 36.629127502441406, "learning_rate": 9.281359022841965e-07, "logits/chosen": -0.72747802734375, "logits/rejected": -0.7426910400390625, "logps/chosen": -308.5406188964844, "logps/rejected": -300.71484375, "logps/weighted_chosen": -2.6044554710388184, "logps/weighted_rejected": -3.862866163253784, "loss": 0.6178, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -33.54804611206055, "rewards/margins": 22.3515625, "rewards/rejected": -55.88496017456055, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.38198548555374146, "rewards/weighted_margins": 0.525561511516571, "rewards/weighted_rejected": -0.9073349237442017, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 20.296154022216797, "learning_rate": 9.233447357514989e-07, "logits/chosen": -0.7092193365097046, "logits/rejected": -0.751629650592804, "logps/chosen": -337.10467529296875, "logps/rejected": -328.71875, "logps/weighted_chosen": -3.054370164871216, "logps/weighted_rejected": -3.5334715843200684, "loss": 0.6534, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -39.15234375, "rewards/margins": 19.770116806030273, "rewards/rejected": -58.90898513793945, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.470510870218277, "rewards/weighted_margins": 0.566607654094696, "rewards/weighted_rejected": -1.0376465320587158, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.8052441477775574, "eval_logits/rejected": -0.8225547075271606, "eval_logps/chosen": -333.44000244140625, "eval_logps/rejected": -331.98199462890625, "eval_logps/weighted_chosen": -2.756896734237671, "eval_logps/weighted_rejected": -3.441680908203125, "eval_loss": 0.6561886668205261, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": -44.67877960205078, "eval_rewards/margins": 19.602703094482422, "eval_rewards/rejected": -64.27362823486328, "eval_rewards/weighted_accuracies": 0.6445000171661377, "eval_rewards/weighted_chosen": -0.485819548368454, "eval_rewards/weighted_margins": 0.4620407819747925, "eval_rewards/weighted_rejected": -0.9478604793548584, "eval_runtime": 1263.3333, "eval_samples_per_second": 1.583, "eval_steps_per_second": 0.396, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 44.70832824707031, "learning_rate": 9.184121755160232e-07, "logits/chosen": -0.7849181890487671, "logits/rejected": -0.8171790838241577, "logps/chosen": -344.84454345703125, "logps/rejected": -356.3671875, "logps/weighted_chosen": -2.9210448265075684, "logps/weighted_rejected": -3.3969483375549316, "loss": 0.6735, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -43.604881286621094, "rewards/margins": 24.157032012939453, "rewards/rejected": -67.75703430175781, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.43794554471969604, "rewards/weighted_margins": 0.407052606344223, "rewards/weighted_rejected": -0.8448547124862671, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 34.907981872558594, "learning_rate": 9.133398690128193e-07, "logits/chosen": -0.8243468999862671, "logits/rejected": -0.852618396282196, "logps/chosen": -370.2640686035156, "logps/rejected": -357.4296875, "logps/weighted_chosen": -2.7059326171875, "logps/weighted_rejected": -3.559033155441284, "loss": 0.6233, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -48.807029724121094, "rewards/margins": 28.050586700439453, "rewards/rejected": -76.85234069824219, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.3096374571323395, "rewards/weighted_margins": 0.5285431146621704, "rewards/weighted_rejected": -0.838287353515625, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 17.412511825561523, "learning_rate": 9.081295103510554e-07, "logits/chosen": -0.7943557500839233, "logits/rejected": -0.8541763424873352, "logps/chosen": -339.65313720703125, "logps/rejected": -351.77813720703125, "logps/weighted_chosen": -2.331298828125, "logps/weighted_rejected": -3.5838379859924316, "loss": 0.5587, "rewards/accuracies": 0.65625, "rewards/chosen": -55.494140625, "rewards/margins": 26.642187118530273, "rewards/rejected": -82.14140319824219, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -0.33439940214157104, "rewards/weighted_margins": 0.7252563238143921, "rewards/weighted_rejected": -1.0597717761993408, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 30.677711486816406, "learning_rate": 9.027828397481989e-07, "logits/chosen": -0.7925201654434204, "logits/rejected": -0.8262939453125, "logps/chosen": -318.59063720703125, "logps/rejected": -337.55157470703125, "logps/weighted_chosen": -3.059436082839966, "logps/weighted_rejected": -3.750012159347534, "loss": 0.6464, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -57.99492263793945, "rewards/margins": 24.116796493530273, "rewards/rejected": -82.107421875, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -0.507769763469696, "rewards/weighted_margins": 0.4883270263671875, "rewards/weighted_rejected": -0.995800793170929, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 34.808658599853516, "learning_rate": 8.973016429487988e-07, "logits/chosen": -0.8280746340751648, "logits/rejected": -0.8393570184707642, "logps/chosen": -340.02032470703125, "logps/rejected": -340.90625, "logps/weighted_chosen": -3.016833543777466, "logps/weighted_rejected": -3.3585205078125, "loss": 0.6373, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -58.115234375, "rewards/margins": 28.575389862060547, "rewards/rejected": -86.7035140991211, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.5330657958984375, "rewards/weighted_margins": 0.48836976289749146, "rewards/weighted_rejected": -1.0212554931640625, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 21.559553146362305, "learning_rate": 8.916877506280601e-07, "logits/chosen": -0.8576828241348267, "logits/rejected": -0.850115954875946, "logps/chosen": -343.0625, "logps/rejected": -340.73126220703125, "logps/weighted_chosen": -3.004504442214966, "logps/weighted_rejected": -3.3214111328125, "loss": 0.6493, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -62.214454650878906, "rewards/margins": 24.381053924560547, "rewards/rejected": -86.59492492675781, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.5099731683731079, "rewards/weighted_margins": 0.535810112953186, "rewards/weighted_rejected": -1.0458984375, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 16.77034568786621, "learning_rate": 8.85943037780415e-07, "logits/chosen": -0.901629626750946, "logits/rejected": -0.9031143188476562, "logps/chosen": -347.7562561035156, "logps/rejected": -320.90936279296875, "logps/weighted_chosen": -2.9189209938049316, "logps/weighted_rejected": -3.346874952316284, "loss": 0.6796, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -61.52226638793945, "rewards/margins": 17.764842987060547, "rewards/rejected": -79.3128890991211, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.5591338872909546, "rewards/weighted_margins": 0.44241029024124146, "rewards/weighted_rejected": -1.00177001953125, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 23.179088592529297, "learning_rate": 8.800694230932884e-07, "logits/chosen": -0.808392345905304, "logits/rejected": -0.8254486322402954, "logps/chosen": -345.52032470703125, "logps/rejected": -338.59844970703125, "logps/weighted_chosen": -2.4705810546875, "logps/weighted_rejected": -3.031982421875, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -56.009376525878906, "rewards/margins": 17.356250762939453, "rewards/rejected": -73.3921890258789, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.3469276428222656, "rewards/weighted_margins": 0.38211363554000854, "rewards/weighted_rejected": -0.7289062738418579, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 16.077539443969727, "learning_rate": 8.740688683062723e-07, "logits/chosen": -0.8602691888809204, "logits/rejected": -0.874432384967804, "logps/chosen": -382.03436279296875, "logps/rejected": -349.27655029296875, "logps/weighted_chosen": -2.4807372093200684, "logps/weighted_rejected": -3.101879835128784, "loss": 0.6615, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -52.95586013793945, "rewards/margins": 20.580469131469727, "rewards/rejected": -73.5445327758789, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.31566160917282104, "rewards/weighted_margins": 0.39473265409469604, "rewards/weighted_rejected": -0.7103912234306335, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 17.0419864654541, "learning_rate": 8.679433775559215e-07, "logits/chosen": -0.8191520571708679, "logits/rejected": -0.8663116693496704, "logps/chosen": -379.78125, "logps/rejected": -361.57501220703125, "logps/weighted_chosen": -2.305920362472534, "logps/weighted_rejected": -3.3094482421875, "loss": 0.6241, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -55.09687423706055, "rewards/margins": 20.994531631469727, "rewards/rejected": -76.0738296508789, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.31669920682907104, "rewards/weighted_margins": 0.4563964903354645, "rewards/weighted_rejected": -0.7728790044784546, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 30.722089767456055, "learning_rate": 8.616949967063871e-07, "logits/chosen": -0.7851959466934204, "logits/rejected": -0.82568359375, "logps/chosen": -323.5859375, "logps/rejected": -338.609375, "logps/weighted_chosen": -2.7684326171875, "logps/weighted_rejected": -3.191943407058716, "loss": 0.6918, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -55.26250076293945, "rewards/margins": 22.568164825439453, "rewards/rejected": -77.8238296508789, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.423666387796402, "rewards/weighted_margins": 0.310333251953125, "rewards/weighted_rejected": -0.733630359172821, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 14.894518852233887, "learning_rate": 8.553258126661154e-07, "logits/chosen": -0.831768810749054, "logits/rejected": -0.846484363079071, "logps/chosen": -338.09686279296875, "logps/rejected": -336.015625, "logps/weighted_chosen": -2.84112548828125, "logps/weighted_rejected": -3.4341063499450684, "loss": 0.708, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -58.20390701293945, "rewards/margins": 21.783594131469727, "rewards/rejected": -79.9749984741211, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.4535583555698395, "rewards/weighted_margins": 0.33486634492874146, "rewards/weighted_rejected": -0.788104236125946, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 23.813823699951172, "learning_rate": 8.488379526908368e-07, "logits/chosen": -0.826812744140625, "logits/rejected": -0.837506115436554, "logps/chosen": -352.4593811035156, "logps/rejected": -357.29376220703125, "logps/weighted_chosen": -2.6490235328674316, "logps/weighted_rejected": -3.211181640625, "loss": 0.6454, "rewards/accuracies": 0.640625, "rewards/chosen": -61.892189025878906, "rewards/margins": 26.424219131469727, "rewards/rejected": -88.33320617675781, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.3862060606479645, "rewards/weighted_margins": 0.415771484375, "rewards/weighted_rejected": -0.8020385503768921, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 19.41891098022461, "learning_rate": 8.422335836730802e-07, "logits/chosen": -0.7994629144668579, "logits/rejected": -0.7995041012763977, "logps/chosen": -333.2593688964844, "logps/rejected": -366.6499938964844, "logps/weighted_chosen": -2.6496825218200684, "logps/weighted_rejected": -3.1250548362731934, "loss": 0.6734, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -59.58320236206055, "rewards/margins": 29.476757049560547, "rewards/rejected": -89.0625, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.47590941190719604, "rewards/weighted_margins": 0.394134521484375, "rewards/weighted_rejected": -0.8701080083847046, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 16.819276809692383, "learning_rate": 8.355149114184485e-07, "logits/chosen": -0.846386730670929, "logits/rejected": -0.8338836431503296, "logps/chosen": -370.0062561035156, "logps/rejected": -376.0843811035156, "logps/weighted_chosen": -2.787951707839966, "logps/weighted_rejected": -3.106738328933716, "loss": 0.6483, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -62.594337463378906, "rewards/margins": 31.204687118530273, "rewards/rejected": -93.80000305175781, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.5348541140556335, "rewards/weighted_margins": 0.446258544921875, "rewards/weighted_rejected": -0.980926513671875, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 18.66504669189453, "learning_rate": 8.286841799088963e-07, "logits/chosen": -0.8683761358261108, "logits/rejected": -0.860211193561554, "logps/chosen": -344.94061279296875, "logps/rejected": -343.61407470703125, "logps/weighted_chosen": -2.3468871116638184, "logps/weighted_rejected": -2.966168165206909, "loss": 0.6577, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -60.857032775878906, "rewards/margins": 19.649999618530273, "rewards/rejected": -80.50117492675781, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.4206695556640625, "rewards/weighted_margins": 0.41551512479782104, "rewards/weighted_rejected": -0.8359512090682983, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 26.941055297851562, "learning_rate": 8.217436705532599e-07, "logits/chosen": -0.8248077630996704, "logits/rejected": -0.8512848019599915, "logps/chosen": -369.5484313964844, "logps/rejected": -348.30157470703125, "logps/weighted_chosen": -2.3807740211486816, "logps/weighted_rejected": -3.006176710128784, "loss": 0.6373, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -66.71875, "rewards/margins": 17.846094131469727, "rewards/rejected": -84.57929992675781, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.3303161561489105, "rewards/weighted_margins": 0.45032960176467896, "rewards/weighted_rejected": -0.7810913324356079, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 1516.0828857421875, "learning_rate": 8.14695701425284e-07, "logits/chosen": -0.8020523190498352, "logits/rejected": -0.845899224281311, "logps/chosen": -371.13751220703125, "logps/rejected": -352.38592529296875, "logps/weighted_chosen": -2.49072265625, "logps/weighted_rejected": -3.2708497047424316, "loss": 0.5885, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -65.716796875, "rewards/margins": 24.369531631469727, "rewards/rejected": -90.1097640991211, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.330526739358902, "rewards/weighted_margins": 0.5371948480606079, "rewards/weighted_rejected": -0.867462158203125, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 295.9232482910156, "learning_rate": 8.075426264894046e-07, "logits/chosen": -0.7686309814453125, "logits/rejected": -0.805737316608429, "logps/chosen": -370.75, "logps/rejected": -373.64373779296875, "logps/weighted_chosen": -2.509265184402466, "logps/weighted_rejected": -3.719970703125, "loss": 0.5532, "rewards/accuracies": 0.6875, "rewards/chosen": -65.3414077758789, "rewards/margins": 30.711523056030273, "rewards/rejected": -96.052734375, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -0.3384948670864105, "rewards/weighted_margins": 0.6478027105331421, "rewards/weighted_rejected": -0.9860213994979858, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 80.06324768066406, "learning_rate": 8.002868348145435e-07, "logits/chosen": -0.7615035772323608, "logits/rejected": -0.758954644203186, "logps/chosen": -364.390625, "logps/rejected": -353.75, "logps/weighted_chosen": -2.562756299972534, "logps/weighted_rejected": -2.821521043777466, "loss": 0.6219, "rewards/accuracies": 0.609375, "rewards/chosen": -64.1957015991211, "rewards/margins": 21.513866424560547, "rewards/rejected": -85.70429992675781, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.38551026582717896, "rewards/weighted_margins": 0.45988160371780396, "rewards/weighted_rejected": -0.8446716070175171, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 1476.4896240234375, "learning_rate": 7.92930749776179e-07, "logits/chosen": -0.75201416015625, "logits/rejected": -0.7803069949150085, "logps/chosen": -337.1851501464844, "logps/rejected": -346.5375061035156, "logps/weighted_chosen": -2.752087354660034, "logps/weighted_rejected": -3.417065382003784, "loss": 0.6452, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -65.52070617675781, "rewards/margins": 20.681640625, "rewards/rejected": -86.203125, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.2934509217739105, "rewards/weighted_margins": 0.537158191204071, "rewards/weighted_rejected": -0.8301132321357727, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 37.869117736816406, "learning_rate": 7.854768282469582e-07, "logits/chosen": -0.814867377281189, "logits/rejected": -0.8494598269462585, "logps/chosen": -332.6953125, "logps/rejected": -362.8187561035156, "logps/weighted_chosen": -2.5288939476013184, "logps/weighted_rejected": -3.1172118186950684, "loss": 0.6496, "rewards/accuracies": 0.640625, "rewards/chosen": -55.247657775878906, "rewards/margins": 29.8330078125, "rewards/rejected": -85.080078125, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.355978399515152, "rewards/weighted_margins": 0.4522338807582855, "rewards/weighted_rejected": -0.8084503412246704, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 48.1231575012207, "learning_rate": 7.779275597761215e-07, "logits/chosen": -0.7673202753067017, "logits/rejected": -0.81195068359375, "logps/chosen": -330.94451904296875, "logps/rejected": -355.31561279296875, "logps/weighted_chosen": -2.6014404296875, "logps/weighted_rejected": -3.1932616233825684, "loss": 0.5952, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -54.890235900878906, "rewards/margins": 31.880468368530273, "rewards/rejected": -86.75312805175781, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -0.2228240966796875, "rewards/weighted_margins": 0.6149749755859375, "rewards/weighted_rejected": -0.8373657464981079, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 296.0299377441406, "learning_rate": 7.702854657580126e-07, "logits/chosen": -0.8295089602470398, "logits/rejected": -0.843798816204071, "logps/chosen": -352.19842529296875, "logps/rejected": -333.7593688964844, "logps/weighted_chosen": -2.45697021484375, "logps/weighted_rejected": -3.375244140625, "loss": 0.6318, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -57.54804611206055, "rewards/margins": 21.8720703125, "rewards/rejected": -79.4625015258789, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.19748535752296448, "rewards/weighted_margins": 0.5647827386856079, "rewards/weighted_rejected": -0.762377917766571, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 222.2068328857422, "learning_rate": 7.625530985899547e-07, "logits/chosen": -0.8145691156387329, "logits/rejected": -0.8263305425643921, "logps/chosen": -328.7578125, "logps/rejected": -331.390625, "logps/weighted_chosen": -2.581188917160034, "logps/weighted_rejected": -3.505688428878784, "loss": 0.6377, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -61.987892150878906, "rewards/margins": 24.421483993530273, "rewards/rejected": -86.39921569824219, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -0.493093878030777, "rewards/weighted_margins": 0.4742370545864105, "rewards/weighted_rejected": -0.9672302007675171, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 34.57517623901367, "learning_rate": 7.547330408197694e-07, "logits/chosen": -0.8249969482421875, "logits/rejected": -0.8720428347587585, "logps/chosen": -363.44219970703125, "logps/rejected": -345.2250061035156, "logps/weighted_chosen": -2.4618163108825684, "logps/weighted_rejected": -3.28759765625, "loss": 0.6383, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -67.7855453491211, "rewards/margins": 21.181835174560547, "rewards/rejected": -88.9535140991211, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.3757568299770355, "rewards/weighted_margins": 0.4518585205078125, "rewards/weighted_rejected": -0.8270477056503296, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 26.55402946472168, "learning_rate": 7.468279042832271e-07, "logits/chosen": -0.8424628973007202, "logits/rejected": -0.8771301507949829, "logps/chosen": -347.6484375, "logps/rejected": -388.42266845703125, "logps/weighted_chosen": -2.660815477371216, "logps/weighted_rejected": -3.1720213890075684, "loss": 0.6743, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -67.98554992675781, "rewards/margins": 30.81640625, "rewards/rejected": -98.75859069824219, "rewards/weighted_accuracies": 0.6312500238418579, "rewards/weighted_chosen": -0.5194793939590454, "rewards/weighted_margins": 0.3566345274448395, "rewards/weighted_rejected": -0.8764098882675171, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 14.42599868774414, "learning_rate": 7.388403292317154e-07, "logits/chosen": -0.7979522943496704, "logits/rejected": -0.8573578000068665, "logps/chosen": -367.4046936035156, "logps/rejected": -358.2515563964844, "logps/weighted_chosen": -2.490283250808716, "logps/weighted_rejected": -3.1418213844299316, "loss": 0.6398, "rewards/accuracies": 0.65625, "rewards/chosen": -66.2933578491211, "rewards/margins": 26.066015243530273, "rewards/rejected": -92.384765625, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.461639404296875, "rewards/weighted_margins": 0.452981561422348, "rewards/weighted_rejected": -0.914324939250946, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 21.45990753173828, "learning_rate": 7.307729834504154e-07, "logits/chosen": -0.8032287359237671, "logits/rejected": -0.8670104742050171, "logps/chosen": -351.39532470703125, "logps/rejected": -360.1734313964844, "logps/weighted_chosen": -2.3328614234924316, "logps/weighted_rejected": -3.198779344558716, "loss": 0.6356, "rewards/accuracies": 0.609375, "rewards/chosen": -69.6167984008789, "rewards/margins": 24.462499618530273, "rewards/rejected": -94.07890319824219, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.3527267575263977, "rewards/weighted_margins": 0.45725250244140625, "rewards/weighted_rejected": -0.8100005984306335, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 20.13976287841797, "learning_rate": 7.226285613672847e-07, "logits/chosen": -0.741473376750946, "logits/rejected": -0.7820758819580078, "logps/chosen": -342.1156311035156, "logps/rejected": -382.6937561035156, "logps/weighted_chosen": -2.3370361328125, "logps/weighted_rejected": -3.3424315452575684, "loss": 0.6236, "rewards/accuracies": 0.703125, "rewards/chosen": -64.8167953491211, "rewards/margins": 39.174217224121094, "rewards/rejected": -103.96992492675781, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.34785765409469604, "rewards/weighted_margins": 0.5167236328125, "rewards/weighted_rejected": -0.864398181438446, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 21.571788787841797, "learning_rate": 7.144097831531398e-07, "logits/chosen": -0.6900985836982727, "logits/rejected": -0.7189788818359375, "logps/chosen": -344.78436279296875, "logps/rejected": -364.53436279296875, "logps/weighted_chosen": -2.346606492996216, "logps/weighted_rejected": -3.139209032058716, "loss": 0.6171, "rewards/accuracies": 0.640625, "rewards/chosen": -74.72969055175781, "rewards/margins": 25.596094131469727, "rewards/rejected": -100.31640625, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.332855224609375, "rewards/weighted_margins": 0.513507068157196, "rewards/weighted_rejected": -0.8466736078262329, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 23.761091232299805, "learning_rate": 7.061193938131396e-07, "logits/chosen": -0.620227038860321, "logits/rejected": -0.6747413873672485, "logps/chosen": -377.5609436035156, "logps/rejected": -363.4296875, "logps/weighted_chosen": -2.772265672683716, "logps/weighted_rejected": -3.069580078125, "loss": 0.6365, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -77.7945327758789, "rewards/margins": 20.221874237060547, "rewards/rejected": -97.98515319824219, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.3748535215854645, "rewards/weighted_margins": 0.473724365234375, "rewards/weighted_rejected": -0.8482757806777954, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 21.571779251098633, "learning_rate": 6.977601622699789e-07, "logits/chosen": -0.689013659954071, "logits/rejected": -0.7498534917831421, "logps/chosen": -354.41876220703125, "logps/rejected": -392.3500061035156, "logps/weighted_chosen": -2.6583251953125, "logps/weighted_rejected": -3.377002000808716, "loss": 0.5618, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -69.55390930175781, "rewards/margins": 43.986717224121094, "rewards/rejected": -113.5484390258789, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -0.28594970703125, "rewards/weighted_margins": 0.6694701910018921, "rewards/weighted_rejected": -0.955474853515625, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 21.845787048339844, "learning_rate": 6.893348804390882e-07, "logits/chosen": -0.7911956906318665, "logits/rejected": -0.8087249994277954, "logps/chosen": -377.0531311035156, "logps/rejected": -377.48126220703125, "logps/weighted_chosen": -2.844287157058716, "logps/weighted_rejected": -3.24560546875, "loss": 0.5927, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -81.9203109741211, "rewards/margins": 35.79375076293945, "rewards/rejected": -117.70625305175781, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -0.3613952696323395, "rewards/weighted_margins": 0.5852203369140625, "rewards/weighted_rejected": -0.94671630859375, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 13.673724174499512, "learning_rate": 6.808463622961578e-07, "logits/chosen": -0.765423595905304, "logits/rejected": -0.8230966329574585, "logps/chosen": -385.33905029296875, "logps/rejected": -413.21563720703125, "logps/weighted_chosen": -2.7145752906799316, "logps/weighted_rejected": -3.412890672683716, "loss": 0.5718, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -92.6429672241211, "rewards/margins": 38.67695236206055, "rewards/rejected": -131.3136749267578, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.43825072050094604, "rewards/weighted_margins": 0.649249255657196, "rewards/weighted_rejected": -1.0877685546875, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 17.156784057617188, "learning_rate": 6.722974429372925e-07, "logits/chosen": -0.733477771282196, "logits/rejected": -0.7933975458145142, "logps/chosen": -418.08282470703125, "logps/rejected": -417.9937438964844, "logps/weighted_chosen": -2.5140380859375, "logps/weighted_rejected": -3.9316039085388184, "loss": 0.5611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -114.50508117675781, "rewards/margins": 41.392189025878906, "rewards/rejected": -155.9523468017578, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -0.5683807134628296, "rewards/weighted_margins": 0.804516613483429, "rewards/weighted_rejected": -1.373052954673767, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 23.082002639770508, "learning_rate": 6.636909776321128e-07, "logits/chosen": -0.8063064813613892, "logits/rejected": -0.802105724811554, "logps/chosen": -369.12811279296875, "logps/rejected": -413.70782470703125, "logps/weighted_chosen": -2.945758104324341, "logps/weighted_rejected": -3.6049561500549316, "loss": 0.5946, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -107.4222640991211, "rewards/margins": 41.763671875, "rewards/rejected": -149.2078094482422, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -0.640515148639679, "rewards/weighted_margins": 0.601641833782196, "rewards/weighted_rejected": -1.2423064708709717, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 27.672487258911133, "learning_rate": 6.550298408701174e-07, "logits/chosen": -0.778796374797821, "logits/rejected": -0.830426037311554, "logps/chosen": -389.8421936035156, "logps/rejected": -428.6312561035156, "logps/weighted_chosen": -3.13909912109375, "logps/weighted_rejected": -4.012915134429932, "loss": 0.6358, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -103.14042663574219, "rewards/margins": 43.176368713378906, "rewards/rejected": -146.2609405517578, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.5248870849609375, "rewards/weighted_margins": 0.555926501750946, "rewards/weighted_rejected": -1.0807831287384033, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 20.153644561767578, "learning_rate": 6.463169254006276e-07, "logits/chosen": -0.7750915288925171, "logits/rejected": -0.8219833374023438, "logps/chosen": -377.8421936035156, "logps/rejected": -378.10467529296875, "logps/weighted_chosen": -2.7843995094299316, "logps/weighted_rejected": -3.576098680496216, "loss": 0.5705, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -96.46601867675781, "rewards/margins": 34.888282775878906, "rewards/rejected": -131.3464813232422, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -0.39057618379592896, "rewards/weighted_margins": 0.706072986125946, "rewards/weighted_rejected": -1.0967223644256592, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 42.77175521850586, "learning_rate": 6.375551412666326e-07, "logits/chosen": -0.7759063839912415, "logits/rejected": -0.8005096316337585, "logps/chosen": -379.19219970703125, "logps/rejected": -388.72344970703125, "logps/weighted_chosen": -2.5501952171325684, "logps/weighted_rejected": -3.5579466819763184, "loss": 0.6503, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -105.25859069824219, "rewards/margins": 27.150781631469727, "rewards/rejected": -132.42733764648438, "rewards/weighted_accuracies": 0.6312500238418579, "rewards/weighted_chosen": -0.5587005615234375, "rewards/weighted_margins": 0.5813232660293579, "rewards/weighted_rejected": -1.1405792236328125, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 24.800992965698242, "learning_rate": 6.287474148328583e-07, "logits/chosen": -0.7202819585800171, "logits/rejected": -0.7240753173828125, "logps/chosen": -371.1812438964844, "logps/rejected": -371.109375, "logps/weighted_chosen": -2.950915575027466, "logps/weighted_rejected": -4.175073146820068, "loss": 0.6282, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -100.2894515991211, "rewards/margins": 23.904687881469727, "rewards/rejected": -124.20625305175781, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -0.5755615234375, "rewards/weighted_margins": 0.5273803472518921, "rewards/weighted_rejected": -1.1032683849334717, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 18.644733428955078, "learning_rate": 6.198966878083857e-07, "logits/chosen": -0.7572265863418579, "logits/rejected": -0.7787246704101562, "logps/chosen": -368.4359436035156, "logps/rejected": -402.46563720703125, "logps/weighted_chosen": -2.8515868186950684, "logps/weighted_rejected": -3.4952635765075684, "loss": 0.6159, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -97.32890319824219, "rewards/margins": 38.184959411621094, "rewards/rejected": -135.45703125, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.4532226622104645, "rewards/weighted_margins": 0.559436023235321, "rewards/weighted_rejected": -1.012457251548767, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 44.07575988769531, "learning_rate": 6.110059162641439e-07, "logits/chosen": -0.7723480463027954, "logits/rejected": -0.802471935749054, "logps/chosen": -375.984375, "logps/rejected": -391.7906188964844, "logps/weighted_chosen": -2.391467332839966, "logps/weighted_rejected": -3.1367430686950684, "loss": 0.6244, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -93.6128921508789, "rewards/margins": 30.975391387939453, "rewards/rejected": -124.58906555175781, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.4246048033237457, "rewards/weighted_margins": 0.47894287109375, "rewards/weighted_rejected": -0.904034435749054, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 26.321582794189453, "learning_rate": 6.020780696456059e-07, "logits/chosen": -0.7484909296035767, "logits/rejected": -0.7926574945449829, "logps/chosen": -359.21875, "logps/rejected": -411.1890563964844, "logps/weighted_chosen": -2.2330689430236816, "logps/weighted_rejected": -3.3442625999450684, "loss": 0.5653, "rewards/accuracies": 0.6875, "rewards/chosen": -88.5511703491211, "rewards/margins": 55.419921875, "rewards/rejected": -143.99453735351562, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -0.47887879610061646, "rewards/weighted_margins": 0.6291259527206421, "rewards/weighted_rejected": -1.1078612804412842, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 44.13637924194336, "learning_rate": 5.931161297810185e-07, "logits/chosen": -0.8126861453056335, "logits/rejected": -0.829357922077179, "logps/chosen": -376.6875, "logps/rejected": -399.51251220703125, "logps/weighted_chosen": -3.1253294944763184, "logps/weighted_rejected": -3.8741211891174316, "loss": 0.6461, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -103.4652328491211, "rewards/margins": 34.713279724121094, "rewards/rejected": -138.1457061767578, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.616656482219696, "rewards/weighted_margins": 0.5488006472587585, "rewards/weighted_rejected": -1.1659362316131592, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 24.634550094604492, "learning_rate": 5.841230898854959e-07, "logits/chosen": -0.742846667766571, "logits/rejected": -0.765765368938446, "logps/chosen": -421.84686279296875, "logps/rejected": -429.4312438964844, "logps/weighted_chosen": -2.9749999046325684, "logps/weighted_rejected": -3.778076171875, "loss": 0.6955, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -121.6539077758789, "rewards/margins": 46.04961013793945, "rewards/rejected": -167.69375610351562, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.820935070514679, "rewards/weighted_margins": 0.548413097858429, "rewards/weighted_rejected": -1.369299292564392, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 28.748939514160156, "learning_rate": 5.751019535613102e-07, "logits/chosen": -0.6985992193222046, "logits/rejected": -0.7225921750068665, "logps/chosen": -365.7984313964844, "logps/rejected": -400.90625, "logps/weighted_chosen": -2.9286131858825684, "logps/weighted_rejected": -4.021093845367432, "loss": 0.6325, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -106.0390625, "rewards/margins": 47.953514099121094, "rewards/rejected": -153.97265625, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.7157821655273438, "rewards/weighted_margins": 0.7237914800643921, "rewards/weighted_rejected": -1.439599633216858, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 15.98474407196045, "learning_rate": 5.660557337947117e-07, "logits/chosen": -0.6841033697128296, "logits/rejected": -0.6997619867324829, "logps/chosen": -409.1937561035156, "logps/rejected": -406.05938720703125, "logps/weighted_chosen": -2.480639696121216, "logps/weighted_rejected": -3.3584961891174316, "loss": 0.5997, "rewards/accuracies": 0.640625, "rewards/chosen": -114.5328140258789, "rewards/margins": 34.099998474121094, "rewards/rejected": -148.6570281982422, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.572741687297821, "rewards/weighted_margins": 0.527575671672821, "rewards/weighted_rejected": -1.100128173828125, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 41.099185943603516, "learning_rate": 5.569874519496174e-07, "logits/chosen": -0.7119758725166321, "logits/rejected": -0.7671966552734375, "logps/chosen": -381.44842529296875, "logps/rejected": -410.5015563964844, "logps/weighted_chosen": -2.8494019508361816, "logps/weighted_rejected": -3.8323974609375, "loss": 0.6259, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -106.146484375, "rewards/margins": 37.33867263793945, "rewards/rejected": -143.4011688232422, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.4599502682685852, "rewards/weighted_margins": 0.6096404790878296, "rewards/weighted_rejected": -1.070257544517517, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 34.9498176574707, "learning_rate": 5.47900136758499e-07, "logits/chosen": -0.6499813199043274, "logits/rejected": -0.7194549441337585, "logps/chosen": -369.4429626464844, "logps/rejected": -382.0953063964844, "logps/weighted_chosen": -2.7041993141174316, "logps/weighted_rejected": -3.534008741378784, "loss": 0.5974, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -101.8080062866211, "rewards/margins": 38.08086013793945, "rewards/rejected": -139.9406280517578, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -0.44673460721969604, "rewards/weighted_margins": 0.6419677734375, "rewards/weighted_rejected": -1.08880615234375, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -0.7614516615867615, "eval_logits/rejected": -0.7845029234886169, "eval_logps/chosen": -403.2145080566406, "eval_logps/rejected": -419.8420104980469, "eval_logps/weighted_chosen": -2.8744430541992188, "eval_logps/weighted_rejected": -3.6894454956054688, "eval_loss": 0.6146492958068848, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": -114.45649719238281, "eval_rewards/margins": 37.66427993774414, "eval_rewards/rejected": -152.1232452392578, "eval_rewards/weighted_accuracies": 0.6679999828338623, "eval_rewards/weighted_chosen": -0.6033662557601929, "eval_rewards/weighted_margins": 0.5922585129737854, "eval_rewards/weighted_rejected": -1.195624828338623, "eval_runtime": 1076.2039, "eval_samples_per_second": 1.858, "eval_steps_per_second": 0.465, "step": 1000 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }