{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26171159382360637, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 126.29230499267578, "learning_rate": 0.0, "logits/chosen": -0.40118408203125, "logits/rejected": -0.41802978515625, "logps/chosen": -297.609375, "logps/rejected": -247.84375, "logps/weighted_chosen": -4.5152587890625, "logps/weighted_rejected": -3.032470703125, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 296.4369812011719, "learning_rate": 4.6875e-08, "logits/chosen": -0.3177456259727478, "logits/rejected": -0.3534359335899353, "logps/chosen": -275.5711669921875, "logps/rejected": -255.90451049804688, "logps/weighted_chosen": -2.350965738296509, "logps/weighted_rejected": -2.549940347671509, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": -0.0401475690305233, "rewards/margins": 0.04296875, "rewards/rejected": -0.0831163227558136, "rewards/weighted_accuracies": 0.3229166567325592, "rewards/weighted_chosen": -0.00032212998485192657, "rewards/weighted_margins": 0.00019327799964230508, "rewards/weighted_rejected": -0.0005154079990461469, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 21.67967414855957, "learning_rate": 9.895833333333332e-08, "logits/chosen": -0.29769057035446167, "logits/rejected": -0.313650518655777, "logps/chosen": -294.3374938964844, "logps/rejected": -272.6703186035156, "logps/weighted_chosen": -2.13031005859375, "logps/weighted_rejected": -2.103222608566284, "loss": 0.6908, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06103515625, "rewards/margins": -0.01318359375, "rewards/rejected": -0.0478515625, "rewards/weighted_accuracies": 0.4437499940395355, "rewards/weighted_chosen": 0.0014366150135174394, "rewards/weighted_margins": 0.0021545409690588713, "rewards/weighted_rejected": -0.0007179260137490928, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 76.9887466430664, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -0.2917121946811676, "logits/rejected": -0.337240606546402, "logps/chosen": -298.02655029296875, "logps/rejected": -268.12188720703125, "logps/weighted_chosen": -2.0724120140075684, "logps/weighted_rejected": -2.4466919898986816, "loss": 0.6912, "rewards/accuracies": 0.28125, "rewards/chosen": -0.0062500000931322575, "rewards/margins": -0.02509765699505806, "rewards/rejected": 0.01884765550494194, "rewards/weighted_accuracies": 0.4281249940395355, "rewards/weighted_chosen": 0.0027938843704760075, "rewards/weighted_margins": 0.0019706725142896175, "rewards/weighted_rejected": 0.0008232116815634072, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 32.98203659057617, "learning_rate": 2.03125e-07, "logits/chosen": -0.3011154234409332, "logits/rejected": -0.3432762026786804, "logps/chosen": -278.63751220703125, "logps/rejected": -253.88125610351562, "logps/weighted_chosen": -2.2070555686950684, "logps/weighted_rejected": -2.605224609375, "loss": 0.692, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": 0.0034667968284338713, "rewards/margins": -0.05991210788488388, "rewards/rejected": 0.06337890774011612, "rewards/weighted_accuracies": 0.35624998807907104, "rewards/weighted_chosen": 0.0014549255138263106, "rewards/weighted_margins": -0.00034332275390625, "rewards/weighted_rejected": 0.0017982482677325606, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 20.751684188842773, "learning_rate": 2.552083333333333e-07, "logits/chosen": -0.2822524905204773, "logits/rejected": -0.32080918550491333, "logps/chosen": -280.31329345703125, "logps/rejected": -267.58709716796875, "logps/weighted_chosen": -2.136962890625, "logps/weighted_rejected": -2.1753907203674316, "loss": 0.6883, "rewards/accuracies": 0.3125, "rewards/chosen": -0.07236327975988388, "rewards/margins": -0.09189452975988388, "rewards/rejected": 0.01953125, "rewards/weighted_accuracies": 0.4375, "rewards/weighted_chosen": 0.0054107666946947575, "rewards/weighted_margins": 0.0078063965775072575, "rewards/weighted_rejected": -0.0023956298828125, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 40.70024108886719, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -0.3149581849575043, "logits/rejected": -0.3086872100830078, "logps/chosen": -277.6031188964844, "logps/rejected": -261.8031311035156, "logps/weighted_chosen": -2.5905518531799316, "logps/weighted_rejected": -2.4834961891174316, "loss": 0.6874, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.03662109375, "rewards/margins": 0.12646484375, "rewards/rejected": -0.08984375, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": 0.0004280090215615928, "rewards/weighted_margins": 0.01105651818215847, "rewards/weighted_rejected": -0.01062927208840847, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 67.51947021484375, "learning_rate": 3.59375e-07, "logits/chosen": -0.318746954202652, "logits/rejected": -0.32574766874313354, "logps/chosen": -289.90313720703125, "logps/rejected": -245.04452514648438, "logps/weighted_chosen": -2.098431348800659, "logps/weighted_rejected": -2.392407178878784, "loss": 0.6841, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.16708984971046448, "rewards/margins": 0.4442382752895355, "rewards/rejected": -0.27714842557907104, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": 0.015575408935546875, "rewards/weighted_margins": 0.02174072340130806, "rewards/weighted_rejected": -0.00616531353443861, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 68.87100982666016, "learning_rate": 4.114583333333333e-07, "logits/chosen": -0.286581426858902, "logits/rejected": -0.3082527220249176, "logps/chosen": -289.5101623535156, "logps/rejected": -270.4375, "logps/weighted_chosen": -2.2385497093200684, "logps/weighted_rejected": -2.4218382835388184, "loss": 0.6727, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.31572264432907104, "rewards/margins": 0.5547851324081421, "rewards/rejected": -0.23906250298023224, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": 0.03613891452550888, "rewards/weighted_margins": 0.05283202975988388, "rewards/weighted_rejected": -0.01669769361615181, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 40.29203414916992, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -0.3158706724643707, "logits/rejected": -0.30914992094039917, "logps/chosen": -280.5726623535156, "logps/rejected": -258.17657470703125, "logps/weighted_chosen": -2.45281982421875, "logps/weighted_rejected": -2.5444703102111816, "loss": 0.6683, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.15966796875, "rewards/margins": 0.599609375, "rewards/rejected": -0.43994140625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": 0.05808715894818306, "rewards/weighted_margins": 0.07471618801355362, "rewards/weighted_rejected": -0.0166168212890625, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 46.855377197265625, "learning_rate": 5.156249999999999e-07, "logits/chosen": -0.2856552004814148, "logits/rejected": -0.3585342466831207, "logps/chosen": -291.05548095703125, "logps/rejected": -287.078125, "logps/weighted_chosen": -1.9577789306640625, "logps/weighted_rejected": -2.532482862472534, "loss": 0.6785, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13925781846046448, "rewards/margins": 0.9869140386581421, "rewards/rejected": -1.1261718273162842, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": 0.03715210035443306, "rewards/weighted_margins": 0.0635833740234375, "rewards/weighted_rejected": -0.02643737755715847, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 55.04579162597656, "learning_rate": 5.677083333333333e-07, "logits/chosen": -0.33493995666503906, "logits/rejected": -0.3254844546318054, "logps/chosen": -297.2953186035156, "logps/rejected": -262.6773376464844, "logps/weighted_chosen": -2.606689453125, "logps/weighted_rejected": -2.648364305496216, "loss": 0.6821, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.9228515625, "rewards/margins": 0.8955078125, "rewards/rejected": -1.818359375, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": -0.005747986026108265, "rewards/weighted_margins": 0.05161895602941513, "rewards/weighted_rejected": -0.05732421949505806, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 22.23135757446289, "learning_rate": 6.197916666666666e-07, "logits/chosen": -0.3393222689628601, "logits/rejected": -0.36481350660324097, "logps/chosen": -295.6703186035156, "logps/rejected": -256.3296813964844, "logps/weighted_chosen": -1.8351562023162842, "logps/weighted_rejected": -2.124218702316284, "loss": 0.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.071679711341858, "rewards/margins": 1.46142578125, "rewards/rejected": -2.5331053733825684, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": 0.0018810272449627519, "rewards/weighted_margins": 0.06835174560546875, "rewards/weighted_rejected": -0.0664466843008995, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 57.93917465209961, "learning_rate": 6.718749999999999e-07, "logits/chosen": -0.30284881591796875, "logits/rejected": -0.2989334166049957, "logps/chosen": -306.5074157714844, "logps/rejected": -279.8265686035156, "logps/weighted_chosen": -1.910064697265625, "logps/weighted_rejected": -2.2278685569763184, "loss": 0.6738, "rewards/accuracies": 0.578125, "rewards/chosen": -0.45097655057907104, "rewards/margins": 1.7268555164337158, "rewards/rejected": -2.177734375, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": 0.02166290208697319, "rewards/weighted_margins": 0.07758025825023651, "rewards/weighted_rejected": -0.05589141696691513, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 66.64070892333984, "learning_rate": 7.239583333333333e-07, "logits/chosen": -0.34190064668655396, "logits/rejected": -0.3586837649345398, "logps/chosen": -300.01483154296875, "logps/rejected": -276.1703186035156, "logps/weighted_chosen": -2.202807664871216, "logps/weighted_rejected": -2.474353075027466, "loss": 0.6635, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3230469226837158, "rewards/margins": 2.2220702171325684, "rewards/rejected": -3.545117139816284, "rewards/weighted_accuracies": 0.609375, "rewards/weighted_chosen": 0.0006683349492959678, "rewards/weighted_margins": 0.10604552924633026, "rewards/weighted_rejected": -0.1053924560546875, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 18.789766311645508, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.2976974546909332, "logits/rejected": -0.3081321716308594, "logps/chosen": -286.27813720703125, "logps/rejected": -255.4640655517578, "logps/weighted_chosen": -2.7657103538513184, "logps/weighted_rejected": -2.831347703933716, "loss": 0.6605, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.104687452316284, "rewards/margins": 2.5054688453674316, "rewards/rejected": -4.610156059265137, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.0018630981212481856, "rewards/weighted_margins": 0.158416748046875, "rewards/weighted_rejected": -0.1603546142578125, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 51.51210021972656, "learning_rate": 8.28125e-07, "logits/chosen": -0.3341739773750305, "logits/rejected": -0.3859619200229645, "logps/chosen": -306.4765625, "logps/rejected": -279.1148376464844, "logps/weighted_chosen": -2.3189454078674316, "logps/weighted_rejected": -2.36669921875, "loss": 0.636, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.575390577316284, "rewards/margins": 3.349609375, "rewards/rejected": -5.925000190734863, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.02147369459271431, "rewards/weighted_margins": 0.22438660264015198, "rewards/weighted_rejected": -0.203105166554451, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 398.3809509277344, "learning_rate": 8.802083333333333e-07, "logits/chosen": -0.36855775117874146, "logits/rejected": -0.37070387601852417, "logps/chosen": -307.1656188964844, "logps/rejected": -265.78436279296875, "logps/weighted_chosen": -2.459460496902466, "logps/weighted_rejected": -2.757373094558716, "loss": 0.6811, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.097460746765137, "rewards/margins": 3.488476514816284, "rewards/rejected": -7.585839748382568, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.018505096435546875, "rewards/weighted_margins": 0.19701537489891052, "rewards/weighted_rejected": -0.17839965224266052, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 55.77580261230469, "learning_rate": 9.322916666666666e-07, "logits/chosen": -0.3392753601074219, "logits/rejected": -0.35816192626953125, "logps/chosen": -278.99530029296875, "logps/rejected": -265.18359375, "logps/weighted_chosen": -2.362103223800659, "logps/weighted_rejected": -2.754711866378784, "loss": 0.6944, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -5.346972465515137, "rewards/margins": 3.5015625953674316, "rewards/rejected": -8.848730087280273, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.05782318115234375, "rewards/weighted_margins": 0.16480103135108948, "rewards/weighted_rejected": -0.22255554795265198, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 38.015960693359375, "learning_rate": 9.84375e-07, "logits/chosen": -0.3686843812465668, "logits/rejected": -0.4041244387626648, "logps/chosen": -314.3070373535156, "logps/rejected": -276.7484436035156, "logps/weighted_chosen": -2.123486280441284, "logps/weighted_rejected": -2.6261963844299316, "loss": 0.6392, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -8.581738471984863, "rewards/margins": 4.317968845367432, "rewards/rejected": -12.900781631469727, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.129638671875, "rewards/weighted_margins": 0.2160186767578125, "rewards/weighted_rejected": -0.345590204000473, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 46.52367401123047, "learning_rate": 9.99959085414323e-07, "logits/chosen": -0.4128967225551605, "logits/rejected": -0.4471847414970398, "logps/chosen": -320.0546875, "logps/rejected": -273.11248779296875, "logps/weighted_chosen": -2.5019164085388184, "logps/weighted_rejected": -2.9936890602111816, "loss": 0.6473, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -10.737597465515137, "rewards/margins": 4.738671779632568, "rewards/rejected": -15.476171493530273, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.11443634331226349, "rewards/weighted_margins": 0.2610321044921875, "rewards/weighted_rejected": -0.37534791231155396, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 21.238189697265625, "learning_rate": 9.997587035630105e-07, "logits/chosen": -0.4288749694824219, "logits/rejected": -0.4688262939453125, "logps/chosen": -300.0765686035156, "logps/rejected": -304.63751220703125, "logps/weighted_chosen": -2.32427978515625, "logps/weighted_rejected": -3.0592284202575684, "loss": 0.6424, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -13.117578506469727, "rewards/margins": 7.013671875, "rewards/rejected": -20.133594512939453, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.21423491835594177, "rewards/weighted_margins": 0.27025145292282104, "rewards/weighted_rejected": -0.4845077395439148, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 24.92041015625, "learning_rate": 9.99391406364405e-07, "logits/chosen": -0.42696380615234375, "logits/rejected": -0.429006963968277, "logps/chosen": -305.4906311035156, "logps/rejected": -288.6312561035156, "logps/weighted_chosen": -2.625018358230591, "logps/weighted_rejected": -3.102160692214966, "loss": 0.6601, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -13.349413871765137, "rewards/margins": 6.373632907867432, "rewards/rejected": -19.72265625, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.20062866806983948, "rewards/weighted_margins": 0.316873162984848, "rewards/weighted_rejected": -0.5174545049667358, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 147.95851135253906, "learning_rate": 9.988573164927884e-07, "logits/chosen": -0.3811447024345398, "logits/rejected": -0.4161086976528168, "logps/chosen": -281.33203125, "logps/rejected": -274.234375, "logps/weighted_chosen": -2.32806396484375, "logps/weighted_rejected": -2.6552734375, "loss": 0.7195, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -12.46875, "rewards/margins": 10.046093940734863, "rewards/rejected": -22.515430450439453, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.24639587104320526, "rewards/weighted_margins": 0.23908081650733948, "rewards/weighted_rejected": -0.4853073060512543, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 26.882122039794922, "learning_rate": 9.98156612329838e-07, "logits/chosen": -0.4748245179653168, "logits/rejected": -0.5250595211982727, "logps/chosen": -278.16717529296875, "logps/rejected": -306.29376220703125, "logps/weighted_chosen": -2.348803758621216, "logps/weighted_rejected": -2.9455933570861816, "loss": 0.6674, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -13.405566215515137, "rewards/margins": 10.753710746765137, "rewards/rejected": -24.158985137939453, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.14908751845359802, "rewards/weighted_margins": 0.33162689208984375, "rewards/weighted_rejected": -0.48021697998046875, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 86.49760437011719, "learning_rate": 9.97289527905053e-07, "logits/chosen": -0.478302001953125, "logits/rejected": -0.48861923813819885, "logps/chosen": -277.0523376464844, "logps/rejected": -275.80938720703125, "logps/weighted_chosen": -2.61376953125, "logps/weighted_rejected": -2.787853956222534, "loss": 0.7022, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -12.673730850219727, "rewards/margins": 7.1806640625, "rewards/rejected": -19.852344512939453, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.12388916313648224, "rewards/weighted_margins": 0.19627074897289276, "rewards/weighted_rejected": -0.3203796446323395, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 19.698871612548828, "learning_rate": 9.962563528175875e-07, "logits/chosen": -0.4065658450126648, "logits/rejected": -0.4432968199253082, "logps/chosen": -310.62890625, "logps/rejected": -281.46405029296875, "logps/weighted_chosen": -2.184094190597534, "logps/weighted_rejected": -3.0492796897888184, "loss": 0.6507, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -11.896581649780273, "rewards/margins": 8.622265815734863, "rewards/rejected": -20.520313262939453, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.1260833740234375, "rewards/weighted_margins": 0.25025635957717896, "rewards/weighted_rejected": -0.3761749267578125, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 16.363121032714844, "learning_rate": 9.950574321395277e-07, "logits/chosen": -0.42208632826805115, "logits/rejected": -0.4458427429199219, "logps/chosen": -305.9046936035156, "logps/rejected": -286.06561279296875, "logps/weighted_chosen": -2.40838623046875, "logps/weighted_rejected": -2.7938475608825684, "loss": 0.6573, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -16.622364044189453, "rewards/margins": 6.233202934265137, "rewards/rejected": -22.855077743530273, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.189697265625, "rewards/weighted_margins": 0.27490538358688354, "rewards/weighted_rejected": -0.4645233154296875, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 54.42692947387695, "learning_rate": 9.936931663006413e-07, "logits/chosen": -0.45263671875, "logits/rejected": -0.44363707304000854, "logps/chosen": -316.171875, "logps/rejected": -303.3656311035156, "logps/weighted_chosen": -2.4659423828125, "logps/weighted_rejected": -3.0541749000549316, "loss": 0.6068, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -13.402734756469727, "rewards/margins": 10.619824409484863, "rewards/rejected": -24.025390625, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.05214080959558487, "rewards/weighted_margins": 0.40336912870407104, "rewards/weighted_rejected": -0.455657958984375, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 39.20017623901367, "learning_rate": 9.921640109546357e-07, "logits/chosen": -0.42310255765914917, "logits/rejected": -0.48920440673828125, "logps/chosen": -283.7171936035156, "logps/rejected": -278.1859436035156, "logps/weighted_chosen": -2.396167039871216, "logps/weighted_rejected": -3.5881590843200684, "loss": 0.6649, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -16.9111328125, "rewards/margins": 9.704492568969727, "rewards/rejected": -26.62109375, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": -0.130279541015625, "rewards/weighted_margins": 0.3882461488246918, "rewards/weighted_rejected": -0.5187179446220398, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 28.03601837158203, "learning_rate": 9.90470476826975e-07, "logits/chosen": -0.485189825296402, "logits/rejected": -0.48862916231155396, "logps/chosen": -289.09765625, "logps/rejected": -297.625, "logps/weighted_chosen": -2.2784485816955566, "logps/weighted_rejected": -2.771862745285034, "loss": 0.6608, "rewards/accuracies": 0.640625, "rewards/chosen": -19.676952362060547, "rewards/margins": 10.679491996765137, "rewards/rejected": -30.360937118530273, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.1669921875, "rewards/weighted_margins": 0.28967589139938354, "rewards/weighted_rejected": -0.4566032290458679, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 538.50927734375, "learning_rate": 9.886131295443002e-07, "logits/chosen": -0.654278576374054, "logits/rejected": -0.7076683044433594, "logps/chosen": -341.85467529296875, "logps/rejected": -309.89764404296875, "logps/weighted_chosen": -2.629150390625, "logps/weighted_rejected": -2.8698973655700684, "loss": 0.6788, "rewards/accuracies": 0.515625, "rewards/chosen": -60.568748474121094, "rewards/margins": -0.7822265625, "rewards/rejected": -59.785743713378906, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.24791869521141052, "rewards/weighted_margins": 0.28089600801467896, "rewards/weighted_rejected": -0.528765857219696, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 34.656883239746094, "learning_rate": 9.865925894455166e-07, "logits/chosen": -0.7003936767578125, "logits/rejected": -0.719250500202179, "logps/chosen": -326.3960876464844, "logps/rejected": -290.3453063964844, "logps/weighted_chosen": -2.553356885910034, "logps/weighted_rejected": -3.10992431640625, "loss": 0.7054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.532812118530273, "rewards/margins": 6.3564453125, "rewards/rejected": -32.88788986206055, "rewards/weighted_accuracies": 0.609375, "rewards/weighted_chosen": -0.18020018935203552, "rewards/weighted_margins": 0.3489990234375, "rewards/weighted_rejected": -0.5293639898300171, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 61.1888542175293, "learning_rate": 9.84409531374603e-07, "logits/chosen": -0.6631911993026733, "logits/rejected": -0.6448425054550171, "logps/chosen": -324.87579345703125, "logps/rejected": -291.71875, "logps/weighted_chosen": -2.5611815452575684, "logps/weighted_rejected": -3.060229539871216, "loss": 0.6449, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -20.070018768310547, "rewards/margins": 8.8251953125, "rewards/rejected": -28.8876953125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.140888974070549, "rewards/weighted_margins": 0.3719635009765625, "rewards/weighted_rejected": -0.5127013921737671, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 27.2315673828125, "learning_rate": 9.820646844552219e-07, "logits/chosen": -0.6496349573135376, "logits/rejected": -0.7006805539131165, "logps/chosen": -295.5882873535156, "logps/rejected": -297.4906311035156, "logps/weighted_chosen": -2.6988892555236816, "logps/weighted_rejected": -2.898681640625, "loss": 0.6788, "rewards/accuracies": 0.6875, "rewards/chosen": -19.176855087280273, "rewards/margins": 12.649316787719727, "rewards/rejected": -31.822460174560547, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.23918533325195312, "rewards/weighted_margins": 0.2961669862270355, "rewards/weighted_rejected": -0.5351837277412415, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 24.073888778686523, "learning_rate": 9.795588318471964e-07, "logits/chosen": -0.7137314081192017, "logits/rejected": -0.7225399017333984, "logps/chosen": -277.8890686035156, "logps/rejected": -304.59063720703125, "logps/weighted_chosen": -2.4300780296325684, "logps/weighted_rejected": -2.771411180496216, "loss": 0.6675, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -18.445703506469727, "rewards/margins": 9.8720703125, "rewards/rejected": -28.31640625, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -0.20159301161766052, "rewards/weighted_margins": 0.2779785096645355, "rewards/weighted_rejected": -0.47947996854782104, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 12.750471115112305, "learning_rate": 9.768928104849415e-07, "logits/chosen": -0.7212737798690796, "logits/rejected": -0.7225433588027954, "logps/chosen": -299.53594970703125, "logps/rejected": -275.5718688964844, "logps/weighted_chosen": -2.667529344558716, "logps/weighted_rejected": -2.800830125808716, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -16.816015243530273, "rewards/margins": 9.876562118530273, "rewards/rejected": -26.690235137939453, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.15018615126609802, "rewards/weighted_margins": 0.301962286233902, "rewards/weighted_rejected": -0.45206451416015625, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 29.441747665405273, "learning_rate": 9.740675107979355e-07, "logits/chosen": -0.6865798830986023, "logits/rejected": -0.7117553949356079, "logps/chosen": -331.06561279296875, "logps/rejected": -300.31719970703125, "logps/weighted_chosen": -1.9907715320587158, "logps/weighted_rejected": -2.932177782058716, "loss": 0.6819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -16.022266387939453, "rewards/margins": 9.630078315734863, "rewards/rejected": -25.654491424560547, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.18877258896827698, "rewards/weighted_margins": 0.2808380126953125, "rewards/weighted_rejected": -0.4699081480503082, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 29.483524322509766, "learning_rate": 9.71083876413323e-07, "logits/chosen": -0.6637862920761108, "logits/rejected": -0.669873058795929, "logps/chosen": -322.3882751464844, "logps/rejected": -300.85858154296875, "logps/weighted_chosen": -2.189379930496216, "logps/weighted_rejected": -2.9217162132263184, "loss": 0.6846, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -20.004688262939453, "rewards/margins": 11.246289253234863, "rewards/rejected": -31.24609375, "rewards/weighted_accuracies": 0.6000000238418579, "rewards/weighted_chosen": -0.22467346489429474, "rewards/weighted_margins": 0.2720580995082855, "rewards/weighted_rejected": -0.4967102110385895, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 20.563907623291016, "learning_rate": 9.67942903840751e-07, "logits/chosen": -0.7051689028739929, "logits/rejected": -0.7537201046943665, "logps/chosen": -324.1015625, "logps/rejected": -310.375, "logps/weighted_chosen": -2.397631883621216, "logps/weighted_rejected": -2.950610399246216, "loss": 0.6478, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -19.788671493530273, "rewards/margins": 16.317577362060547, "rewards/rejected": -36.111328125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.22498169541358948, "rewards/weighted_margins": 0.3520751893520355, "rewards/weighted_rejected": -0.5770629644393921, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 23.1771183013916, "learning_rate": 9.646456421395447e-07, "logits/chosen": -0.7504974603652954, "logits/rejected": -0.7628723382949829, "logps/chosen": -341.2171936035156, "logps/rejected": -343.9375, "logps/weighted_chosen": -2.2680420875549316, "logps/weighted_rejected": -3.0065674781799316, "loss": 0.6746, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -22.944530487060547, "rewards/margins": 17.315624237060547, "rewards/rejected": -40.2587890625, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.20337525010108948, "rewards/weighted_margins": 0.216084286570549, "rewards/weighted_rejected": -0.4196624755859375, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 14.420520782470703, "learning_rate": 9.611931925683266e-07, "logits/chosen": -0.7154334783554077, "logits/rejected": -0.7491073608398438, "logps/chosen": -331.38983154296875, "logps/rejected": -303.3890686035156, "logps/weighted_chosen": -2.190844774246216, "logps/weighted_rejected": -2.7472167015075684, "loss": 0.6135, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -25.111621856689453, "rewards/margins": 14.543554306030273, "rewards/rejected": -39.658592224121094, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.21762695908546448, "rewards/weighted_margins": 0.36674195528030396, "rewards/weighted_rejected": -0.584503173828125, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 27.429603576660156, "learning_rate": 9.575867082172085e-07, "logits/chosen": -0.7379547357559204, "logits/rejected": -0.7826202511787415, "logps/chosen": -337.46405029296875, "logps/rejected": -320.23907470703125, "logps/weighted_chosen": -2.6387085914611816, "logps/weighted_rejected": -2.712329149246216, "loss": 0.6716, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -32.554298400878906, "rewards/margins": 17.513866424560547, "rewards/rejected": -50.060157775878906, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.35613709688186646, "rewards/weighted_margins": 0.38392335176467896, "rewards/weighted_rejected": -0.7401062250137329, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 18.97144889831543, "learning_rate": 9.538273936226673e-07, "logits/chosen": -0.778491199016571, "logits/rejected": -0.811004638671875, "logps/chosen": -292.83984375, "logps/rejected": -304.05352783203125, "logps/weighted_chosen": -2.796630859375, "logps/weighted_rejected": -3.2444825172424316, "loss": 0.6544, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -27.327733993530273, "rewards/margins": 12.26318359375, "rewards/rejected": -39.58984375, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.21069030463695526, "rewards/weighted_margins": 0.3678832948207855, "rewards/weighted_rejected": -0.57806396484375, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 25.43462371826172, "learning_rate": 9.499165043652391e-07, "logits/chosen": -0.7674010992050171, "logits/rejected": -0.7686828374862671, "logps/chosen": -319.55859375, "logps/rejected": -309.03436279296875, "logps/weighted_chosen": -2.82305908203125, "logps/weighted_rejected": -2.993237257003784, "loss": 0.631, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -28.386133193969727, "rewards/margins": 13.9384765625, "rewards/rejected": -42.326072692871094, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.26903897523880005, "rewards/weighted_margins": 0.353515625, "rewards/weighted_rejected": -0.622546374797821, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 65.78443908691406, "learning_rate": 9.458553466501665e-07, "logits/chosen": -0.8066772222518921, "logits/rejected": -0.8363037109375, "logps/chosen": -314.7945251464844, "logps/rejected": -287.65313720703125, "logps/weighted_chosen": -2.8233399391174316, "logps/weighted_rejected": -3.013622999191284, "loss": 0.6831, "rewards/accuracies": 0.640625, "rewards/chosen": -28.642578125, "rewards/margins": 15.162694931030273, "rewards/rejected": -43.80976486206055, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.3802246153354645, "rewards/weighted_margins": 0.3342132568359375, "rewards/weighted_rejected": -0.7139984369277954, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 13.290085792541504, "learning_rate": 9.416452768711366e-07, "logits/chosen": -0.7957550287246704, "logits/rejected": -0.8287414312362671, "logps/chosen": -323.0093688964844, "logps/rejected": -306.98907470703125, "logps/weighted_chosen": -2.544872999191284, "logps/weighted_rejected": -3.100903272628784, "loss": 0.6759, "rewards/accuracies": 0.640625, "rewards/chosen": -30.906835556030273, "rewards/margins": 16.355859756469727, "rewards/rejected": -47.24980545043945, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.343759149312973, "rewards/weighted_margins": 0.40337830781936646, "rewards/weighted_rejected": -0.747100830078125, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 21.766939163208008, "learning_rate": 9.372877011572557e-07, "logits/chosen": -0.7200164794921875, "logits/rejected": -0.742279052734375, "logps/chosen": -342.75079345703125, "logps/rejected": -318.60626220703125, "logps/weighted_chosen": -2.5311522483825684, "logps/weighted_rejected": -2.990124464035034, "loss": 0.63, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -32.077247619628906, "rewards/margins": 11.883398056030273, "rewards/rejected": -43.959373474121094, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -0.30525511503219604, "rewards/weighted_margins": 0.45678406953811646, "rewards/weighted_rejected": -0.7624969482421875, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 28.76239776611328, "learning_rate": 9.327840749034141e-07, "logits/chosen": -0.7930053472518921, "logits/rejected": -0.8311401605606079, "logps/chosen": -316.79998779296875, "logps/rejected": -326.5062561035156, "logps/weighted_chosen": -2.4120116233825684, "logps/weighted_rejected": -3.591870069503784, "loss": 0.6639, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -29.640039443969727, "rewards/margins": 20.350976943969727, "rewards/rejected": -49.9853515625, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -0.27521055936813354, "rewards/weighted_margins": 0.4522705078125, "rewards/weighted_rejected": -0.7274719476699829, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 36.629127502441406, "learning_rate": 9.281359022841965e-07, "logits/chosen": -0.72747802734375, "logits/rejected": -0.7426910400390625, "logps/chosen": -308.5406188964844, "logps/rejected": -300.71484375, "logps/weighted_chosen": -2.6044554710388184, "logps/weighted_rejected": -3.862866163253784, "loss": 0.6178, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -33.54804611206055, "rewards/margins": 22.3515625, "rewards/rejected": -55.88496017456055, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.38198548555374146, "rewards/weighted_margins": 0.525561511516571, "rewards/weighted_rejected": -0.9073349237442017, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 20.296154022216797, "learning_rate": 9.233447357514989e-07, "logits/chosen": -0.7092193365097046, "logits/rejected": -0.751629650592804, "logps/chosen": -337.10467529296875, "logps/rejected": -328.71875, "logps/weighted_chosen": -3.054370164871216, "logps/weighted_rejected": -3.5334715843200684, "loss": 0.6534, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -39.15234375, "rewards/margins": 19.770116806030273, "rewards/rejected": -58.90898513793945, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.470510870218277, "rewards/weighted_margins": 0.566607654094696, "rewards/weighted_rejected": -1.0376465320587158, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.8052441477775574, "eval_logits/rejected": -0.8225547075271606, "eval_logps/chosen": -333.44000244140625, "eval_logps/rejected": -331.98199462890625, "eval_logps/weighted_chosen": -2.756896734237671, "eval_logps/weighted_rejected": -3.441680908203125, "eval_loss": 0.6561886668205261, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": -44.67877960205078, "eval_rewards/margins": 19.602703094482422, "eval_rewards/rejected": -64.27362823486328, "eval_rewards/weighted_accuracies": 0.6445000171661377, "eval_rewards/weighted_chosen": -0.485819548368454, "eval_rewards/weighted_margins": 0.4620407819747925, "eval_rewards/weighted_rejected": -0.9478604793548584, "eval_runtime": 1263.3333, "eval_samples_per_second": 1.583, "eval_steps_per_second": 0.396, "step": 500 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }