{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3086864060199574, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.000000000000001e-07, "logits/chosen": -1.977054476737976, "logits/rejected": -2.017892599105835, "logps/chosen": -169.97320556640625, "logps/rejected": -186.7821807861328, "loss": 0.7082, "rewards/accuracies": 0.4375, "rewards/chosen": -0.016062308102846146, "rewards/margins": -0.02699420601129532, "rewards/rejected": 0.010931899771094322, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -1.8305251598358154, "logits/rejected": -1.8582998514175415, "logps/chosen": -155.8516082763672, "logps/rejected": -165.23692321777344, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017804148374125361, "rewards/margins": 0.004515504464507103, "rewards/rejected": -0.0027350913733243942, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.5e-06, "logits/chosen": -1.7455682754516602, "logits/rejected": -1.7944730520248413, "logps/chosen": -158.9869842529297, "logps/rejected": -179.4861602783203, "loss": 0.6886, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00722665898501873, "rewards/margins": 0.01148004550486803, "rewards/rejected": -0.004253389313817024, "step": 3 }, { "epoch": 0.01, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.8648240566253662, "logits/rejected": -1.8583375215530396, "logps/chosen": -186.27041625976562, "logps/rejected": -174.98153686523438, "loss": 0.6867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.013353967107832432, "rewards/margins": 0.014140583574771881, "rewards/rejected": -0.0007866150699555874, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.5e-06, "logits/chosen": -1.5987751483917236, "logits/rejected": -1.602742075920105, "logps/chosen": -166.3009796142578, "logps/rejected": -182.16493225097656, "loss": 0.6973, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01031036488711834, "rewards/margins": -0.007737827021628618, "rewards/rejected": -0.0025725378654897213, "step": 5 }, { "epoch": 0.01, "learning_rate": 3e-06, "logits/chosen": -1.4330253601074219, "logits/rejected": -1.4852710962295532, "logps/chosen": -170.77926635742188, "logps/rejected": -203.38731384277344, "loss": 0.6967, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0022835731506347656, "rewards/margins": -0.005300428252667189, "rewards/rejected": 0.003016853705048561, "step": 6 }, { "epoch": 0.01, "learning_rate": 3.5000000000000004e-06, "logits/chosen": -1.5559855699539185, "logits/rejected": -1.5930315256118774, "logps/chosen": -200.5897979736328, "logps/rejected": -216.58615112304688, "loss": 0.6832, "rewards/accuracies": 0.4375, "rewards/chosen": 0.015020323917269707, "rewards/margins": 0.02334442362189293, "rewards/rejected": -0.008324098773300648, "step": 7 }, { "epoch": 0.01, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.7643598318099976, "logits/rejected": -1.7637088298797607, "logps/chosen": -166.2308349609375, "logps/rejected": -167.15399169921875, "loss": 0.6973, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03954277187585831, "rewards/margins": -0.005299141630530357, "rewards/rejected": -0.0342436321079731, "step": 8 }, { "epoch": 0.01, "learning_rate": 4.5e-06, "logits/chosen": -2.035048246383667, "logits/rejected": -2.1090240478515625, "logps/chosen": -192.5548095703125, "logps/rejected": -188.7212371826172, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": 0.019565440714359283, "rewards/margins": 0.030419450253248215, "rewards/rejected": -0.010854003950953484, "step": 9 }, { "epoch": 0.01, "learning_rate": 5e-06, "logits/chosen": -1.78806471824646, "logits/rejected": -1.771653175354004, "logps/chosen": -169.3682403564453, "logps/rejected": -163.67990112304688, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0037752394564449787, "rewards/margins": 0.01528622955083847, "rewards/rejected": -0.01906147226691246, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.500000000000001e-06, "logits/chosen": -1.6899704933166504, "logits/rejected": -1.7716753482818604, "logps/chosen": -197.17576599121094, "logps/rejected": -232.6337127685547, "loss": 0.7158, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06386594474315643, "rewards/margins": -0.04272947832942009, "rewards/rejected": -0.02113647386431694, "step": 11 }, { "epoch": 0.02, "learning_rate": 6e-06, "logits/chosen": -1.8938390016555786, "logits/rejected": -1.8334659337997437, "logps/chosen": -150.29385375976562, "logps/rejected": -159.55947875976562, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.0028306012973189354, "rewards/margins": 0.01067290361970663, "rewards/rejected": -0.00784230325371027, "step": 12 }, { "epoch": 0.02, "learning_rate": 6.5000000000000004e-06, "logits/chosen": -1.87659752368927, "logits/rejected": -1.8801605701446533, "logps/chosen": -205.460693359375, "logps/rejected": -202.0462188720703, "loss": 0.6984, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0009463531896471977, "rewards/margins": -0.007184028625488281, "rewards/rejected": 0.008130382746458054, "step": 13 }, { "epoch": 0.02, "learning_rate": 7.000000000000001e-06, "logits/chosen": -1.9238759279251099, "logits/rejected": -2.0003201961517334, "logps/chosen": -184.6726531982422, "logps/rejected": -177.62509155273438, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": -0.01391973439604044, "rewards/margins": -0.004122593905776739, "rewards/rejected": -0.009797144681215286, "step": 14 }, { "epoch": 0.02, "learning_rate": 7.5e-06, "logits/chosen": -1.569742202758789, "logits/rejected": -1.5442943572998047, "logps/chosen": -171.07496643066406, "logps/rejected": -170.74981689453125, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": -0.020387031137943268, "rewards/margins": -0.008263109251856804, "rewards/rejected": -0.012123920023441315, "step": 15 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-06, "logits/chosen": -1.7774536609649658, "logits/rejected": -1.8797023296356201, "logps/chosen": -173.0279541015625, "logps/rejected": -187.43557739257812, "loss": 0.6999, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03834056854248047, "rewards/margins": -0.01055521797388792, "rewards/rejected": -0.027785349637269974, "step": 16 }, { "epoch": 0.02, "learning_rate": 8.500000000000002e-06, "logits/chosen": -1.7320338487625122, "logits/rejected": -1.6945910453796387, "logps/chosen": -185.27392578125, "logps/rejected": -189.87738037109375, "loss": 0.6808, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007571219466626644, "rewards/margins": 0.025459958240389824, "rewards/rejected": -0.017888737842440605, "step": 17 }, { "epoch": 0.02, "learning_rate": 9e-06, "logits/chosen": -1.7604248523712158, "logits/rejected": -1.7776434421539307, "logps/chosen": -196.05404663085938, "logps/rejected": -190.14569091796875, "loss": 0.6914, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02323136478662491, "rewards/margins": 0.00760660320520401, "rewards/rejected": -0.03083796612918377, "step": 18 }, { "epoch": 0.02, "learning_rate": 9.5e-06, "logits/chosen": -1.59357488155365, "logits/rejected": -1.5590327978134155, "logps/chosen": -213.9495391845703, "logps/rejected": -218.56654357910156, "loss": 0.7222, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05001959949731827, "rewards/margins": -0.05468587949872017, "rewards/rejected": 0.004666280932724476, "step": 19 }, { "epoch": 0.03, "learning_rate": 1e-05, "logits/chosen": -1.752557635307312, "logits/rejected": -1.7027242183685303, "logps/chosen": -213.31336975097656, "logps/rejected": -204.09646606445312, "loss": 0.6887, "rewards/accuracies": 0.5625, "rewards/chosen": -0.017501091584563255, "rewards/margins": 0.010245682671666145, "rewards/rejected": -0.027746770530939102, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.05e-05, "logits/chosen": -1.9553875923156738, "logits/rejected": -1.9184911251068115, "logps/chosen": -175.35333251953125, "logps/rejected": -180.54550170898438, "loss": 0.655, "rewards/accuracies": 0.9375, "rewards/chosen": 0.021092725917696953, "rewards/margins": 0.08075069636106491, "rewards/rejected": -0.059657976031303406, "step": 21 }, { "epoch": 0.03, "learning_rate": 1.1000000000000001e-05, "logits/chosen": -1.8159900903701782, "logits/rejected": -1.771599531173706, "logps/chosen": -185.33059692382812, "logps/rejected": -209.6474609375, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": -0.017464350908994675, "rewards/margins": 0.006189251318573952, "rewards/rejected": -0.023653600364923477, "step": 22 }, { "epoch": 0.03, "learning_rate": 1.1500000000000002e-05, "logits/chosen": -1.8995972871780396, "logits/rejected": -1.9293156862258911, "logps/chosen": -178.39755249023438, "logps/rejected": -211.63937377929688, "loss": 0.7191, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05782761424779892, "rewards/margins": -0.04498009383678436, "rewards/rejected": -0.012847519479691982, "step": 23 }, { "epoch": 0.03, "learning_rate": 1.2e-05, "logits/chosen": -1.8924778699874878, "logits/rejected": -1.8979182243347168, "logps/chosen": -167.22109985351562, "logps/rejected": -176.33663940429688, "loss": 0.7028, "rewards/accuracies": 0.5, "rewards/chosen": -0.02767309918999672, "rewards/margins": -0.016704557463526726, "rewards/rejected": -0.010968542657792568, "step": 24 }, { "epoch": 0.03, "learning_rate": 1.25e-05, "logits/chosen": -1.840967059135437, "logits/rejected": -1.797666311264038, "logps/chosen": -175.57766723632812, "logps/rejected": -148.11917114257812, "loss": 0.6967, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01812169887125492, "rewards/margins": -0.003334569279104471, "rewards/rejected": -0.014787126332521439, "step": 25 }, { "epoch": 0.03, "learning_rate": 1.3000000000000001e-05, "logits/chosen": -1.8270690441131592, "logits/rejected": -1.7811157703399658, "logps/chosen": -165.993896484375, "logps/rejected": -165.7574005126953, "loss": 0.7151, "rewards/accuracies": 0.375, "rewards/chosen": -0.06210968643426895, "rewards/margins": -0.040364596992731094, "rewards/rejected": -0.02174508571624756, "step": 26 }, { "epoch": 0.04, "learning_rate": 1.3500000000000001e-05, "logits/chosen": -1.877970576286316, "logits/rejected": -1.9722176790237427, "logps/chosen": -148.4381103515625, "logps/rejected": -173.12579345703125, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": -0.022617867216467857, "rewards/margins": 0.021304797381162643, "rewards/rejected": -0.04392266273498535, "step": 27 }, { "epoch": 0.04, "learning_rate": 1.4000000000000001e-05, "logits/chosen": -1.922934889793396, "logits/rejected": -1.9657589197158813, "logps/chosen": -170.66554260253906, "logps/rejected": -176.7419891357422, "loss": 0.7316, "rewards/accuracies": 0.1875, "rewards/chosen": -0.10023985058069229, "rewards/margins": -0.0725136250257492, "rewards/rejected": -0.027726221829652786, "step": 28 }, { "epoch": 0.04, "learning_rate": 1.45e-05, "logits/chosen": -1.9069832563400269, "logits/rejected": -1.8908956050872803, "logps/chosen": -193.16102600097656, "logps/rejected": -193.22003173828125, "loss": 0.6847, "rewards/accuracies": 0.3125, "rewards/chosen": 0.005608892068266869, "rewards/margins": 0.020983649417757988, "rewards/rejected": -0.01537475548684597, "step": 29 }, { "epoch": 0.04, "learning_rate": 1.5e-05, "logits/chosen": -1.7016417980194092, "logits/rejected": -1.7221649885177612, "logps/chosen": -188.40786743164062, "logps/rejected": -175.92909240722656, "loss": 0.7016, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09070225059986115, "rewards/margins": -0.013078359887003899, "rewards/rejected": -0.0776238888502121, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.55e-05, "logits/chosen": -1.9492610692977905, "logits/rejected": -1.9104335308074951, "logps/chosen": -180.67147827148438, "logps/rejected": -184.8843994140625, "loss": 0.6708, "rewards/accuracies": 0.625, "rewards/chosen": 0.019913675263524055, "rewards/margins": 0.049428701400756836, "rewards/rejected": -0.02951502799987793, "step": 31 }, { "epoch": 0.04, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -1.8524138927459717, "logits/rejected": -1.9383589029312134, "logps/chosen": -159.52560424804688, "logps/rejected": -170.27255249023438, "loss": 0.7146, "rewards/accuracies": 0.375, "rewards/chosen": -0.03338008001446724, "rewards/margins": -0.03860168159008026, "rewards/rejected": 0.00522160530090332, "step": 32 }, { "epoch": 0.04, "learning_rate": 1.65e-05, "logits/chosen": -1.7788478136062622, "logits/rejected": -1.835086703300476, "logps/chosen": -180.18177795410156, "logps/rejected": -206.07110595703125, "loss": 0.7059, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05509199947118759, "rewards/margins": -0.023075008764863014, "rewards/rejected": -0.03201699256896973, "step": 33 }, { "epoch": 0.04, "learning_rate": 1.7000000000000003e-05, "logits/chosen": -1.8998254537582397, "logits/rejected": -1.9138494729995728, "logps/chosen": -194.03167724609375, "logps/rejected": -203.6524658203125, "loss": 0.6915, "rewards/accuracies": 0.375, "rewards/chosen": -0.05050516128540039, "rewards/margins": 0.007271335460245609, "rewards/rejected": -0.057776499539613724, "step": 34 }, { "epoch": 0.05, "learning_rate": 1.75e-05, "logits/chosen": -1.9758315086364746, "logits/rejected": -2.0455610752105713, "logps/chosen": -153.28883361816406, "logps/rejected": -162.89920043945312, "loss": 0.7122, "rewards/accuracies": 0.3125, "rewards/chosen": -0.04079794883728027, "rewards/margins": -0.034585997462272644, "rewards/rejected": -0.006211946718394756, "step": 35 }, { "epoch": 0.05, "learning_rate": 1.8e-05, "logits/chosen": -1.597517967224121, "logits/rejected": -1.631009817123413, "logps/chosen": -160.9354248046875, "logps/rejected": -168.85618591308594, "loss": 0.6918, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07598643749952316, "rewards/margins": 0.005950784310698509, "rewards/rejected": -0.08193722367286682, "step": 36 }, { "epoch": 0.05, "learning_rate": 1.85e-05, "logits/chosen": -1.7368295192718506, "logits/rejected": -1.7420881986618042, "logps/chosen": -169.4617462158203, "logps/rejected": -184.20599365234375, "loss": 0.6948, "rewards/accuracies": 0.25, "rewards/chosen": -0.0290069580078125, "rewards/margins": -0.0007306085899472237, "rewards/rejected": -0.0282763484865427, "step": 37 }, { "epoch": 0.05, "learning_rate": 1.9e-05, "logits/chosen": -1.687361717224121, "logits/rejected": -1.7176148891448975, "logps/chosen": -174.06297302246094, "logps/rejected": -192.69715881347656, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": -0.09517102688550949, "rewards/margins": -0.0029970891773700714, "rewards/rejected": -0.09217393398284912, "step": 38 }, { "epoch": 0.05, "learning_rate": 1.9500000000000003e-05, "logits/chosen": -1.8842296600341797, "logits/rejected": -1.9039793014526367, "logps/chosen": -162.37741088867188, "logps/rejected": -176.11697387695312, "loss": 0.7109, "rewards/accuracies": 0.375, "rewards/chosen": -0.13322168588638306, "rewards/margins": -0.03214216232299805, "rewards/rejected": -0.10107951611280441, "step": 39 }, { "epoch": 0.05, "learning_rate": 2e-05, "logits/chosen": -1.6896815299987793, "logits/rejected": -1.7052747011184692, "logps/chosen": -155.33111572265625, "logps/rejected": -159.064208984375, "loss": 0.6846, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04464542865753174, "rewards/margins": 0.025189755484461784, "rewards/rejected": -0.06983518600463867, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.05e-05, "logits/chosen": -1.7584484815597534, "logits/rejected": -1.6827234029769897, "logps/chosen": -161.47369384765625, "logps/rejected": -166.0779571533203, "loss": 0.6856, "rewards/accuracies": 0.375, "rewards/chosen": -0.07150936126708984, "rewards/margins": 0.019259024411439896, "rewards/rejected": -0.09076839685440063, "step": 41 }, { "epoch": 0.05, "learning_rate": 2.1e-05, "logits/chosen": -2.0230112075805664, "logits/rejected": -1.9628382921218872, "logps/chosen": -160.32073974609375, "logps/rejected": -182.98248291015625, "loss": 0.6857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09749487042427063, "rewards/margins": 0.01751875691115856, "rewards/rejected": -0.11501362919807434, "step": 42 }, { "epoch": 0.06, "learning_rate": 2.15e-05, "logits/chosen": -1.8469973802566528, "logits/rejected": -1.814754843711853, "logps/chosen": -194.247314453125, "logps/rejected": -204.80491638183594, "loss": 0.7142, "rewards/accuracies": 0.375, "rewards/chosen": -0.15520897507667542, "rewards/margins": -0.03922419250011444, "rewards/rejected": -0.11598476767539978, "step": 43 }, { "epoch": 0.06, "learning_rate": 2.2000000000000003e-05, "logits/chosen": -1.9133589267730713, "logits/rejected": -1.9711928367614746, "logps/chosen": -186.54978942871094, "logps/rejected": -189.23045349121094, "loss": 0.7238, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1358685940504074, "rewards/margins": -0.05334620922803879, "rewards/rejected": -0.08252239227294922, "step": 44 }, { "epoch": 0.06, "learning_rate": 2.25e-05, "logits/chosen": -1.6977447271347046, "logits/rejected": -1.7883001565933228, "logps/chosen": -178.20858764648438, "logps/rejected": -201.03770446777344, "loss": 0.6465, "rewards/accuracies": 0.8125, "rewards/chosen": -0.053537700325250626, "rewards/margins": 0.1018117368221283, "rewards/rejected": -0.15534944832324982, "step": 45 }, { "epoch": 0.06, "learning_rate": 2.3000000000000003e-05, "logits/chosen": -1.9271280765533447, "logits/rejected": -1.9463679790496826, "logps/chosen": -176.5238800048828, "logps/rejected": -166.8386993408203, "loss": 0.7159, "rewards/accuracies": 0.5, "rewards/chosen": -0.09149947762489319, "rewards/margins": -0.04024248570203781, "rewards/rejected": -0.05125699192285538, "step": 46 }, { "epoch": 0.06, "learning_rate": 2.35e-05, "logits/chosen": -1.7680522203445435, "logits/rejected": -1.668277621269226, "logps/chosen": -189.94366455078125, "logps/rejected": -177.8812255859375, "loss": 0.6647, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08811293542385101, "rewards/margins": 0.06799888610839844, "rewards/rejected": -0.15611180663108826, "step": 47 }, { "epoch": 0.06, "learning_rate": 2.4e-05, "logits/chosen": -1.5737125873565674, "logits/rejected": -1.626237154006958, "logps/chosen": -177.7281036376953, "logps/rejected": -185.34068298339844, "loss": 0.7038, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12158739566802979, "rewards/margins": -0.004673934541642666, "rewards/rejected": -0.11691347509622574, "step": 48 }, { "epoch": 0.06, "learning_rate": 2.45e-05, "logits/chosen": -1.487099528312683, "logits/rejected": -1.5200517177581787, "logps/chosen": -152.00973510742188, "logps/rejected": -163.13150024414062, "loss": 0.7211, "rewards/accuracies": 0.25, "rewards/chosen": -0.12497053295373917, "rewards/margins": -0.05033881962299347, "rewards/rejected": -0.0746317207813263, "step": 49 }, { "epoch": 0.07, "learning_rate": 2.5e-05, "logits/chosen": -1.751338243484497, "logits/rejected": -1.7326526641845703, "logps/chosen": -180.03758239746094, "logps/rejected": -191.03634643554688, "loss": 0.7029, "rewards/accuracies": 0.5, "rewards/chosen": -0.12404017895460129, "rewards/margins": -0.013475272804498672, "rewards/rejected": -0.11056490242481232, "step": 50 }, { "epoch": 0.07, "learning_rate": 2.5500000000000003e-05, "logits/chosen": -1.9426243305206299, "logits/rejected": -1.9601188898086548, "logps/chosen": -158.4928741455078, "logps/rejected": -164.06317138671875, "loss": 0.7142, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12027917802333832, "rewards/margins": -0.028645988553762436, "rewards/rejected": -0.09163318574428558, "step": 51 }, { "epoch": 0.07, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -1.5965681076049805, "logits/rejected": -1.597716212272644, "logps/chosen": -195.06454467773438, "logps/rejected": -193.9747314453125, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": -0.14499236643314362, "rewards/margins": 0.02140347845852375, "rewards/rejected": -0.16639582812786102, "step": 52 }, { "epoch": 0.07, "learning_rate": 2.6500000000000004e-05, "logits/chosen": -1.9194509983062744, "logits/rejected": -1.883784294128418, "logps/chosen": -161.39295959472656, "logps/rejected": -167.61610412597656, "loss": 0.6965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11441681534051895, "rewards/margins": -0.00019459612667560577, "rewards/rejected": -0.1142222210764885, "step": 53 }, { "epoch": 0.07, "learning_rate": 2.7000000000000002e-05, "logits/chosen": -1.7368502616882324, "logits/rejected": -1.7216427326202393, "logps/chosen": -205.18130493164062, "logps/rejected": -191.90237426757812, "loss": 0.66, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1574774831533432, "rewards/margins": 0.07581701874732971, "rewards/rejected": -0.23329448699951172, "step": 54 }, { "epoch": 0.07, "learning_rate": 2.7500000000000004e-05, "logits/chosen": -2.0303847789764404, "logits/rejected": -2.0456559658050537, "logps/chosen": -169.33453369140625, "logps/rejected": -166.40707397460938, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": -0.14774441719055176, "rewards/margins": 0.1020236536860466, "rewards/rejected": -0.24976806342601776, "step": 55 }, { "epoch": 0.07, "learning_rate": 2.8000000000000003e-05, "logits/chosen": -2.035883903503418, "logits/rejected": -1.9933511018753052, "logps/chosen": -190.82186889648438, "logps/rejected": -190.29147338867188, "loss": 0.7023, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11226066946983337, "rewards/margins": -0.01058507151901722, "rewards/rejected": -0.1016756072640419, "step": 56 }, { "epoch": 0.07, "learning_rate": 2.8499999999999998e-05, "logits/chosen": -1.9773008823394775, "logits/rejected": -1.935595989227295, "logps/chosen": -180.8785400390625, "logps/rejected": -193.48155212402344, "loss": 0.6739, "rewards/accuracies": 0.5, "rewards/chosen": -0.2835652828216553, "rewards/margins": 0.048453718423843384, "rewards/rejected": -0.33201897144317627, "step": 57 }, { "epoch": 0.08, "learning_rate": 2.9e-05, "logits/chosen": -1.681299090385437, "logits/rejected": -1.6699227094650269, "logps/chosen": -165.8355255126953, "logps/rejected": -191.2743377685547, "loss": 0.7356, "rewards/accuracies": 0.5, "rewards/chosen": -0.17172232270240784, "rewards/margins": -0.07655029743909836, "rewards/rejected": -0.09517201036214828, "step": 58 }, { "epoch": 0.08, "learning_rate": 2.95e-05, "logits/chosen": -1.71488356590271, "logits/rejected": -1.7937383651733398, "logps/chosen": -177.51776123046875, "logps/rejected": -188.60760498046875, "loss": 0.6358, "rewards/accuracies": 0.625, "rewards/chosen": -0.1568491905927658, "rewards/margins": 0.12602630257606506, "rewards/rejected": -0.2828754782676697, "step": 59 }, { "epoch": 0.08, "learning_rate": 3e-05, "logits/chosen": -2.0625619888305664, "logits/rejected": -2.1217410564422607, "logps/chosen": -190.26431274414062, "logps/rejected": -215.28648376464844, "loss": 0.7777, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3512558341026306, "rewards/margins": -0.13770633935928345, "rewards/rejected": -0.21354950964450836, "step": 60 }, { "epoch": 0.08, "learning_rate": 3.05e-05, "logits/chosen": -1.5000535249710083, "logits/rejected": -1.508121371269226, "logps/chosen": -167.42588806152344, "logps/rejected": -151.4751739501953, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.15655343234539032, "rewards/margins": 0.01617264747619629, "rewards/rejected": -0.1727260947227478, "step": 61 }, { "epoch": 0.08, "learning_rate": 3.1e-05, "logits/chosen": -1.628549337387085, "logits/rejected": -1.7045596837997437, "logps/chosen": -206.2198944091797, "logps/rejected": -188.40350341796875, "loss": 0.7688, "rewards/accuracies": 0.375, "rewards/chosen": -0.2638259530067444, "rewards/margins": -0.12546539306640625, "rewards/rejected": -0.13836055994033813, "step": 62 }, { "epoch": 0.08, "learning_rate": 3.15e-05, "logits/chosen": -1.6634955406188965, "logits/rejected": -1.641862154006958, "logps/chosen": -173.11769104003906, "logps/rejected": -179.14816284179688, "loss": 0.6446, "rewards/accuracies": 0.625, "rewards/chosen": -0.1516265571117401, "rewards/margins": 0.10875654965639114, "rewards/rejected": -0.26038309931755066, "step": 63 }, { "epoch": 0.08, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -1.90239417552948, "logits/rejected": -1.846576452255249, "logps/chosen": -185.16522216796875, "logps/rejected": -187.98654174804688, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": -0.26641684770584106, "rewards/margins": 0.004999059252440929, "rewards/rejected": -0.27141591906547546, "step": 64 }, { "epoch": 0.09, "learning_rate": 3.2500000000000004e-05, "logits/chosen": -1.9015129804611206, "logits/rejected": -1.9043163061141968, "logps/chosen": -166.89393615722656, "logps/rejected": -185.8280487060547, "loss": 0.7096, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3341715931892395, "rewards/margins": -0.005350928753614426, "rewards/rejected": -0.32882067561149597, "step": 65 }, { "epoch": 0.09, "learning_rate": 3.3e-05, "logits/chosen": -1.7935004234313965, "logits/rejected": -1.8295319080352783, "logps/chosen": -145.04159545898438, "logps/rejected": -152.42388916015625, "loss": 0.7515, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3907526433467865, "rewards/margins": -0.10265941917896271, "rewards/rejected": -0.2880932092666626, "step": 66 }, { "epoch": 0.09, "learning_rate": 3.35e-05, "logits/chosen": -1.6959490776062012, "logits/rejected": -1.680245280265808, "logps/chosen": -165.76852416992188, "logps/rejected": -161.328857421875, "loss": 0.7212, "rewards/accuracies": 0.5, "rewards/chosen": -0.3491905629634857, "rewards/margins": -0.03104216605424881, "rewards/rejected": -0.3181484043598175, "step": 67 }, { "epoch": 0.09, "learning_rate": 3.4000000000000007e-05, "logits/chosen": -1.5831743478775024, "logits/rejected": -1.5631486177444458, "logps/chosen": -182.02952575683594, "logps/rejected": -183.06907653808594, "loss": 0.6674, "rewards/accuracies": 0.5, "rewards/chosen": -0.22372691333293915, "rewards/margins": 0.07622986286878586, "rewards/rejected": -0.2999567687511444, "step": 68 }, { "epoch": 0.09, "learning_rate": 3.45e-05, "logits/chosen": -1.7518221139907837, "logits/rejected": -1.7015327215194702, "logps/chosen": -181.39324951171875, "logps/rejected": -181.88720703125, "loss": 0.6668, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3097344934940338, "rewards/margins": 0.0728234276175499, "rewards/rejected": -0.3825579285621643, "step": 69 }, { "epoch": 0.09, "learning_rate": 3.5e-05, "logits/chosen": -1.8494486808776855, "logits/rejected": -1.80762779712677, "logps/chosen": -181.36915588378906, "logps/rejected": -186.561279296875, "loss": 0.7449, "rewards/accuracies": 0.375, "rewards/chosen": -0.47243672609329224, "rewards/margins": -0.07020688056945801, "rewards/rejected": -0.40222981572151184, "step": 70 }, { "epoch": 0.09, "learning_rate": 3.55e-05, "logits/chosen": -1.701026201248169, "logits/rejected": -1.6742680072784424, "logps/chosen": -157.6632843017578, "logps/rejected": -161.87045288085938, "loss": 0.6544, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27613064646720886, "rewards/margins": 0.10120917111635208, "rewards/rejected": -0.37733981013298035, "step": 71 }, { "epoch": 0.09, "learning_rate": 3.6e-05, "logits/chosen": -1.7675864696502686, "logits/rejected": -1.8016642332077026, "logps/chosen": -185.2778778076172, "logps/rejected": -212.45452880859375, "loss": 0.6621, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3811972141265869, "rewards/margins": 0.07623375207185745, "rewards/rejected": -0.45743098855018616, "step": 72 }, { "epoch": 0.1, "learning_rate": 3.65e-05, "logits/chosen": -1.8149230480194092, "logits/rejected": -1.8410683870315552, "logps/chosen": -167.20913696289062, "logps/rejected": -197.37989807128906, "loss": 0.7319, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48692482709884644, "rewards/margins": -0.050217654556035995, "rewards/rejected": -0.43670713901519775, "step": 73 }, { "epoch": 0.1, "learning_rate": 3.7e-05, "logits/chosen": -1.8524658679962158, "logits/rejected": -1.8538126945495605, "logps/chosen": -164.4295654296875, "logps/rejected": -180.02191162109375, "loss": 0.7363, "rewards/accuracies": 0.25, "rewards/chosen": -0.4177883267402649, "rewards/margins": -0.050662752240896225, "rewards/rejected": -0.36712557077407837, "step": 74 }, { "epoch": 0.1, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -1.2946817874908447, "logits/rejected": -1.3125836849212646, "logps/chosen": -225.81192016601562, "logps/rejected": -252.005859375, "loss": 0.7049, "rewards/accuracies": 0.5, "rewards/chosen": -0.37055703997612, "rewards/margins": 0.016286462545394897, "rewards/rejected": -0.3868435025215149, "step": 75 }, { "epoch": 0.1, "learning_rate": 3.8e-05, "logits/chosen": -1.7864320278167725, "logits/rejected": -1.8551700115203857, "logps/chosen": -176.35296630859375, "logps/rejected": -182.03231811523438, "loss": 0.7029, "rewards/accuracies": 0.5, "rewards/chosen": -0.49798864126205444, "rewards/margins": 0.02277611568570137, "rewards/rejected": -0.5207647681236267, "step": 76 }, { "epoch": 0.1, "learning_rate": 3.85e-05, "logits/chosen": -1.7414928674697876, "logits/rejected": -1.7363300323486328, "logps/chosen": -191.74154663085938, "logps/rejected": -199.81724548339844, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4908958375453949, "rewards/margins": 0.02561158686876297, "rewards/rejected": -0.5165074467658997, "step": 77 }, { "epoch": 0.1, "learning_rate": 3.9000000000000006e-05, "logits/chosen": -1.835726261138916, "logits/rejected": -1.840294599533081, "logps/chosen": -191.46029663085938, "logps/rejected": -168.4354248046875, "loss": 0.7986, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5988375544548035, "rewards/margins": -0.1457357108592987, "rewards/rejected": -0.45310184359550476, "step": 78 }, { "epoch": 0.1, "learning_rate": 3.9500000000000005e-05, "logits/chosen": -1.6384897232055664, "logits/rejected": -1.5781760215759277, "logps/chosen": -217.35772705078125, "logps/rejected": -217.51126098632812, "loss": 0.7354, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6283512115478516, "rewards/margins": -0.027955979108810425, "rewards/rejected": -0.6003952026367188, "step": 79 }, { "epoch": 0.1, "learning_rate": 4e-05, "logits/chosen": -1.7660375833511353, "logits/rejected": -1.7639034986495972, "logps/chosen": -152.1668243408203, "logps/rejected": -141.91021728515625, "loss": 0.7167, "rewards/accuracies": 0.5, "rewards/chosen": -0.5395210385322571, "rewards/margins": -0.012110946699976921, "rewards/rejected": -0.527410089969635, "step": 80 }, { "epoch": 0.11, "learning_rate": 4.05e-05, "logits/chosen": -1.5262843370437622, "logits/rejected": -1.534919261932373, "logps/chosen": -182.58859252929688, "logps/rejected": -200.82281494140625, "loss": 0.6912, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6189998388290405, "rewards/margins": 0.02953934296965599, "rewards/rejected": -0.6485391855239868, "step": 81 }, { "epoch": 0.11, "learning_rate": 4.1e-05, "logits/chosen": -1.6617029905319214, "logits/rejected": -1.6961578130722046, "logps/chosen": -177.42286682128906, "logps/rejected": -168.7061767578125, "loss": 0.7955, "rewards/accuracies": 0.5, "rewards/chosen": -0.7245805859565735, "rewards/margins": -0.15311545133590698, "rewards/rejected": -0.5714651942253113, "step": 82 }, { "epoch": 0.11, "learning_rate": 4.15e-05, "logits/chosen": -1.6676827669143677, "logits/rejected": -1.5490450859069824, "logps/chosen": -171.3192596435547, "logps/rejected": -173.60470581054688, "loss": 0.7403, "rewards/accuracies": 0.375, "rewards/chosen": -0.5414891242980957, "rewards/margins": -0.05256550386548042, "rewards/rejected": -0.4889236092567444, "step": 83 }, { "epoch": 0.11, "learning_rate": 4.2e-05, "logits/chosen": -1.8843668699264526, "logits/rejected": -1.9513225555419922, "logps/chosen": -172.60548400878906, "logps/rejected": -164.8987579345703, "loss": 0.7633, "rewards/accuracies": 0.375, "rewards/chosen": -0.5382730960845947, "rewards/margins": -0.10875138640403748, "rewards/rejected": -0.42952167987823486, "step": 84 }, { "epoch": 0.11, "learning_rate": 4.25e-05, "logits/chosen": -1.9940603971481323, "logits/rejected": -1.9973390102386475, "logps/chosen": -161.12863159179688, "logps/rejected": -164.32958984375, "loss": 0.7724, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5231802463531494, "rewards/margins": -0.11174039542675018, "rewards/rejected": -0.41143983602523804, "step": 85 }, { "epoch": 0.11, "learning_rate": 4.3e-05, "logits/chosen": -1.8103983402252197, "logits/rejected": -1.7585985660552979, "logps/chosen": -195.51132202148438, "logps/rejected": -189.4246826171875, "loss": 0.747, "rewards/accuracies": 0.375, "rewards/chosen": -0.7077435255050659, "rewards/margins": -0.08058477193117142, "rewards/rejected": -0.6271587610244751, "step": 86 }, { "epoch": 0.11, "learning_rate": 4.35e-05, "logits/chosen": -1.9289149045944214, "logits/rejected": -1.982129693031311, "logps/chosen": -170.78253173828125, "logps/rejected": -172.57882690429688, "loss": 0.8229, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6636737585067749, "rewards/margins": -0.2039366364479065, "rewards/rejected": -0.4597371816635132, "step": 87 }, { "epoch": 0.12, "learning_rate": 4.4000000000000006e-05, "logits/chosen": -1.875556230545044, "logits/rejected": -1.8859914541244507, "logps/chosen": -160.60577392578125, "logps/rejected": -171.32774353027344, "loss": 0.6905, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3266763389110565, "rewards/margins": 0.03434550017118454, "rewards/rejected": -0.36102184653282166, "step": 88 }, { "epoch": 0.12, "learning_rate": 4.4500000000000004e-05, "logits/chosen": -1.7824954986572266, "logits/rejected": -1.7617722749710083, "logps/chosen": -177.59042358398438, "logps/rejected": -200.9052276611328, "loss": 0.6799, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4084916114807129, "rewards/margins": 0.0701964944601059, "rewards/rejected": -0.47868812084198, "step": 89 }, { "epoch": 0.12, "learning_rate": 4.5e-05, "logits/chosen": -1.7419114112854004, "logits/rejected": -1.7621021270751953, "logps/chosen": -185.32742309570312, "logps/rejected": -171.69085693359375, "loss": 0.8305, "rewards/accuracies": 0.25, "rewards/chosen": -0.3024221956729889, "rewards/margins": -0.20739878714084625, "rewards/rejected": -0.09502339363098145, "step": 90 }, { "epoch": 0.12, "learning_rate": 4.55e-05, "logits/chosen": -1.83467435836792, "logits/rejected": -1.8328973054885864, "logps/chosen": -207.63388061523438, "logps/rejected": -210.3101043701172, "loss": 0.6628, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3415074646472931, "rewards/margins": 0.07163538783788681, "rewards/rejected": -0.4131428897380829, "step": 91 }, { "epoch": 0.12, "learning_rate": 4.600000000000001e-05, "logits/chosen": -1.6864506006240845, "logits/rejected": -1.7266168594360352, "logps/chosen": -180.49522399902344, "logps/rejected": -198.40599060058594, "loss": 0.6523, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27834925055503845, "rewards/margins": 0.10665541142225266, "rewards/rejected": -0.3850046396255493, "step": 92 }, { "epoch": 0.12, "learning_rate": 4.6500000000000005e-05, "logits/chosen": -1.4600155353546143, "logits/rejected": -1.4545408487319946, "logps/chosen": -191.01947021484375, "logps/rejected": -180.70298767089844, "loss": 0.7165, "rewards/accuracies": 0.5, "rewards/chosen": -0.1802830994129181, "rewards/margins": -0.027431445196270943, "rewards/rejected": -0.1528516411781311, "step": 93 }, { "epoch": 0.12, "learning_rate": 4.7e-05, "logits/chosen": -1.8134263753890991, "logits/rejected": -1.8928412199020386, "logps/chosen": -184.84503173828125, "logps/rejected": -193.90377807617188, "loss": 0.6644, "rewards/accuracies": 0.625, "rewards/chosen": -0.3032827377319336, "rewards/margins": 0.0916454941034317, "rewards/rejected": -0.3949282467365265, "step": 94 }, { "epoch": 0.12, "learning_rate": 4.75e-05, "logits/chosen": -1.8253490924835205, "logits/rejected": -1.810834527015686, "logps/chosen": -208.2128448486328, "logps/rejected": -175.4803009033203, "loss": 0.7444, "rewards/accuracies": 0.5, "rewards/chosen": -0.5496629476547241, "rewards/margins": -0.08057989180088043, "rewards/rejected": -0.4690830111503601, "step": 95 }, { "epoch": 0.13, "learning_rate": 4.8e-05, "logits/chosen": -1.908506989479065, "logits/rejected": -1.9602031707763672, "logps/chosen": -184.815185546875, "logps/rejected": -176.07138061523438, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": -0.3538370728492737, "rewards/margins": 0.013028910383582115, "rewards/rejected": -0.36686599254608154, "step": 96 }, { "epoch": 0.13, "learning_rate": 4.85e-05, "logits/chosen": -1.7022223472595215, "logits/rejected": -1.6424753665924072, "logps/chosen": -198.0614776611328, "logps/rejected": -215.3161163330078, "loss": 0.7737, "rewards/accuracies": 0.3125, "rewards/chosen": -0.2290499359369278, "rewards/margins": -0.12730106711387634, "rewards/rejected": -0.10174884647130966, "step": 97 }, { "epoch": 0.13, "learning_rate": 4.9e-05, "logits/chosen": -2.018606662750244, "logits/rejected": -2.027151584625244, "logps/chosen": -167.92147827148438, "logps/rejected": -166.90982055664062, "loss": 0.6961, "rewards/accuracies": 0.625, "rewards/chosen": -0.3297194838523865, "rewards/margins": 0.02775608003139496, "rewards/rejected": -0.35747551918029785, "step": 98 }, { "epoch": 0.13, "learning_rate": 4.9500000000000004e-05, "logits/chosen": -1.7240030765533447, "logits/rejected": -1.7241712808609009, "logps/chosen": -180.00389099121094, "logps/rejected": -189.3558349609375, "loss": 0.7948, "rewards/accuracies": 0.375, "rewards/chosen": -0.4574447572231293, "rewards/margins": -0.12154103070497513, "rewards/rejected": -0.33590370416641235, "step": 99 }, { "epoch": 0.13, "learning_rate": 5e-05, "logits/chosen": -1.7809938192367554, "logits/rejected": -1.9036985635757446, "logps/chosen": -157.1652069091797, "logps/rejected": -173.80288696289062, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": -0.2328178882598877, "rewards/margins": 0.05165515094995499, "rewards/rejected": -0.2844730615615845, "step": 100 }, { "epoch": 0.13, "learning_rate": 4.999997432392803e-05, "logits/chosen": -1.9480023384094238, "logits/rejected": -1.9346106052398682, "logps/chosen": -197.60128784179688, "logps/rejected": -193.88124084472656, "loss": 0.6427, "rewards/accuracies": 0.625, "rewards/chosen": -0.26800671219825745, "rewards/margins": 0.13769717514514923, "rewards/rejected": -0.40570390224456787, "step": 101 }, { "epoch": 0.13, "learning_rate": 4.9999897295764844e-05, "logits/chosen": -2.048476457595825, "logits/rejected": -2.0353472232818604, "logps/chosen": -180.3016357421875, "logps/rejected": -180.51600646972656, "loss": 0.6366, "rewards/accuracies": 0.625, "rewards/chosen": -0.35791218280792236, "rewards/margins": 0.15290379524230957, "rewards/rejected": -0.5108159780502319, "step": 102 }, { "epoch": 0.13, "learning_rate": 4.9999768915668665e-05, "logits/chosen": -1.9134316444396973, "logits/rejected": -1.8900353908538818, "logps/chosen": -152.71189880371094, "logps/rejected": -153.96690368652344, "loss": 0.6941, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18337132036685944, "rewards/margins": 0.03930587321519852, "rewards/rejected": -0.22267718613147736, "step": 103 }, { "epoch": 0.14, "learning_rate": 4.999958918390321e-05, "logits/chosen": -1.8933653831481934, "logits/rejected": -1.8493753671646118, "logps/chosen": -188.6655731201172, "logps/rejected": -183.1341552734375, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": -0.38441047072410583, "rewards/margins": -0.010426240041851997, "rewards/rejected": -0.3739842474460602, "step": 104 }, { "epoch": 0.14, "learning_rate": 4.999935810083766e-05, "logits/chosen": -1.7264684438705444, "logits/rejected": -1.694319486618042, "logps/chosen": -156.22084045410156, "logps/rejected": -152.80894470214844, "loss": 0.6763, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28453487157821655, "rewards/margins": 0.052657999098300934, "rewards/rejected": -0.3371928334236145, "step": 105 }, { "epoch": 0.14, "learning_rate": 4.999907566694667e-05, "logits/chosen": -1.8885689973831177, "logits/rejected": -1.9364815950393677, "logps/chosen": -167.39117431640625, "logps/rejected": -191.052001953125, "loss": 0.6963, "rewards/accuracies": 0.4375, "rewards/chosen": -0.21066321432590485, "rewards/margins": 0.03920959681272507, "rewards/rejected": -0.2498728483915329, "step": 106 }, { "epoch": 0.14, "learning_rate": 4.9998741882810384e-05, "logits/chosen": -1.7698872089385986, "logits/rejected": -1.7441281080245972, "logps/chosen": -178.68572998046875, "logps/rejected": -174.1964569091797, "loss": 0.7459, "rewards/accuracies": 0.5, "rewards/chosen": -0.24953651428222656, "rewards/margins": -0.08057431131601334, "rewards/rejected": -0.16896219551563263, "step": 107 }, { "epoch": 0.14, "learning_rate": 4.999835674911443e-05, "logits/chosen": -1.812888741493225, "logits/rejected": -1.7789666652679443, "logps/chosen": -228.23631286621094, "logps/rejected": -203.85357666015625, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": -0.10872795432806015, "rewards/margins": 0.05488254129886627, "rewards/rejected": -0.16361048817634583, "step": 108 }, { "epoch": 0.14, "learning_rate": 4.999792026664991e-05, "logits/chosen": -1.6739952564239502, "logits/rejected": -1.6701843738555908, "logps/chosen": -203.80810546875, "logps/rejected": -211.82334899902344, "loss": 0.6853, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3305152356624603, "rewards/margins": 0.05953298509120941, "rewards/rejected": -0.39004823565483093, "step": 109 }, { "epoch": 0.14, "learning_rate": 4.9997432436313384e-05, "logits/chosen": -1.6255407333374023, "logits/rejected": -1.586310625076294, "logps/chosen": -165.31040954589844, "logps/rejected": -186.821044921875, "loss": 0.646, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3459918200969696, "rewards/margins": 0.1395949125289917, "rewards/rejected": -0.4855867028236389, "step": 110 }, { "epoch": 0.15, "learning_rate": 4.99968932591069e-05, "logits/chosen": -1.8984012603759766, "logits/rejected": -1.8569310903549194, "logps/chosen": -182.6356201171875, "logps/rejected": -176.14752197265625, "loss": 0.7585, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3858329653739929, "rewards/margins": -0.05486001819372177, "rewards/rejected": -0.33097296953201294, "step": 111 }, { "epoch": 0.15, "learning_rate": 4.999630273613799e-05, "logits/chosen": -1.8298226594924927, "logits/rejected": -1.812253475189209, "logps/chosen": -163.0826416015625, "logps/rejected": -200.62098693847656, "loss": 0.7425, "rewards/accuracies": 0.375, "rewards/chosen": -0.2680785357952118, "rewards/margins": -0.04629645124077797, "rewards/rejected": -0.22178205847740173, "step": 112 }, { "epoch": 0.15, "learning_rate": 4.999566086861961e-05, "logits/chosen": -1.6931625604629517, "logits/rejected": -1.7089687585830688, "logps/chosen": -147.44491577148438, "logps/rejected": -150.4454345703125, "loss": 0.7146, "rewards/accuracies": 0.5, "rewards/chosen": -0.224314883351326, "rewards/margins": 0.05584639310836792, "rewards/rejected": -0.2801613211631775, "step": 113 }, { "epoch": 0.15, "learning_rate": 4.999496765787024e-05, "logits/chosen": -1.7311415672302246, "logits/rejected": -1.612362027168274, "logps/chosen": -195.65594482421875, "logps/rejected": -194.56130981445312, "loss": 0.7017, "rewards/accuracies": 0.4375, "rewards/chosen": -0.28545114398002625, "rewards/margins": 0.020438771694898605, "rewards/rejected": -0.30588990449905396, "step": 114 }, { "epoch": 0.15, "learning_rate": 4.9994223105313774e-05, "logits/chosen": -1.9310710430145264, "logits/rejected": -1.9658077955245972, "logps/chosen": -179.00779724121094, "logps/rejected": -181.42135620117188, "loss": 0.6605, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2686367928981781, "rewards/margins": 0.10176892578601837, "rewards/rejected": -0.37040573358535767, "step": 115 }, { "epoch": 0.15, "learning_rate": 4.9993427212479606e-05, "logits/chosen": -1.7969621419906616, "logits/rejected": -1.8055509328842163, "logps/chosen": -176.26036071777344, "logps/rejected": -171.2470703125, "loss": 0.6656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.33693569898605347, "rewards/margins": 0.08709227293729782, "rewards/rejected": -0.4240279793739319, "step": 116 }, { "epoch": 0.15, "learning_rate": 4.999257998100254e-05, "logits/chosen": -1.5152311325073242, "logits/rejected": -1.553884506225586, "logps/chosen": -179.60372924804688, "logps/rejected": -164.34481811523438, "loss": 0.7563, "rewards/accuracies": 0.5, "rewards/chosen": -0.3611801564693451, "rewards/margins": -0.08427368104457855, "rewards/rejected": -0.27690649032592773, "step": 117 }, { "epoch": 0.15, "learning_rate": 4.999168141262289e-05, "logits/chosen": -1.7877562046051025, "logits/rejected": -1.7855968475341797, "logps/chosen": -165.04220581054688, "logps/rejected": -179.1771240234375, "loss": 0.7999, "rewards/accuracies": 0.375, "rewards/chosen": -0.3364974856376648, "rewards/margins": -0.1316596120595932, "rewards/rejected": -0.20483790338039398, "step": 118 }, { "epoch": 0.16, "learning_rate": 4.9990731509186376e-05, "logits/chosen": -1.4499574899673462, "logits/rejected": -1.4440600872039795, "logps/chosen": -183.58778381347656, "logps/rejected": -198.231689453125, "loss": 0.7216, "rewards/accuracies": 0.5, "rewards/chosen": -0.3034099340438843, "rewards/margins": -0.00025239214301109314, "rewards/rejected": -0.3031575381755829, "step": 119 }, { "epoch": 0.16, "learning_rate": 4.998973027264419e-05, "logits/chosen": -1.6583937406539917, "logits/rejected": -1.689673900604248, "logps/chosen": -187.60508728027344, "logps/rejected": -212.7592315673828, "loss": 0.6404, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3053140342235565, "rewards/margins": 0.17152619361877441, "rewards/rejected": -0.47684019804000854, "step": 120 }, { "epoch": 0.16, "learning_rate": 4.998867770505295e-05, "logits/chosen": -1.6554609537124634, "logits/rejected": -1.6350326538085938, "logps/chosen": -181.46209716796875, "logps/rejected": -173.83157348632812, "loss": 0.726, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24238371849060059, "rewards/margins": 0.002255776897072792, "rewards/rejected": -0.24463950097560883, "step": 121 }, { "epoch": 0.16, "learning_rate": 4.9987573808574726e-05, "logits/chosen": -1.8908902406692505, "logits/rejected": -1.858853816986084, "logps/chosen": -174.7786865234375, "logps/rejected": -177.30545043945312, "loss": 0.7269, "rewards/accuracies": 0.625, "rewards/chosen": 0.0712527185678482, "rewards/margins": -0.02764507755637169, "rewards/rejected": 0.09889779984951019, "step": 122 }, { "epoch": 0.16, "learning_rate": 4.9986418585477016e-05, "logits/chosen": -1.7081693410873413, "logits/rejected": -1.7438371181488037, "logps/chosen": -191.28123474121094, "logps/rejected": -197.5807647705078, "loss": 0.6859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2962351143360138, "rewards/margins": 0.06765662133693695, "rewards/rejected": -0.36389175057411194, "step": 123 }, { "epoch": 0.16, "learning_rate": 4.998521203813274e-05, "logits/chosen": -1.805833101272583, "logits/rejected": -1.7511969804763794, "logps/chosen": -166.08702087402344, "logps/rejected": -159.9141082763672, "loss": 0.6387, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16753454506397247, "rewards/margins": 0.16043387353420258, "rewards/rejected": -0.32796838879585266, "step": 124 }, { "epoch": 0.16, "learning_rate": 4.9983954169020256e-05, "logits/chosen": -1.5241700410842896, "logits/rejected": -1.6004612445831299, "logps/chosen": -189.8046875, "logps/rejected": -198.48001098632812, "loss": 0.7803, "rewards/accuracies": 0.375, "rewards/chosen": -0.35226038098335266, "rewards/margins": -0.09872373938560486, "rewards/rejected": -0.2535366714000702, "step": 125 }, { "epoch": 0.16, "learning_rate": 4.9982644980723334e-05, "logits/chosen": -1.3276153802871704, "logits/rejected": -1.372768759727478, "logps/chosen": -179.91940307617188, "logps/rejected": -180.60751342773438, "loss": 0.7245, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4333783686161041, "rewards/margins": -0.014091454446315765, "rewards/rejected": -0.41928690671920776, "step": 126 }, { "epoch": 0.17, "learning_rate": 4.998128447593117e-05, "logits/chosen": -1.5195338726043701, "logits/rejected": -1.4305707216262817, "logps/chosen": -179.16712951660156, "logps/rejected": -162.93356323242188, "loss": 0.7394, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3149658441543579, "rewards/margins": -0.04299226403236389, "rewards/rejected": -0.2719736099243164, "step": 127 }, { "epoch": 0.17, "learning_rate": 4.997987265743834e-05, "logits/chosen": -1.8512688875198364, "logits/rejected": -1.791621446609497, "logps/chosen": -171.71990966796875, "logps/rejected": -177.26954650878906, "loss": 0.7217, "rewards/accuracies": 0.375, "rewards/chosen": -0.2529350221157074, "rewards/margins": -0.008312180638313293, "rewards/rejected": -0.2446228414773941, "step": 128 }, { "epoch": 0.17, "learning_rate": 4.997840952814484e-05, "logits/chosen": -1.8096928596496582, "logits/rejected": -1.7790553569793701, "logps/chosen": -174.4140167236328, "logps/rejected": -176.01708984375, "loss": 0.7997, "rewards/accuracies": 0.375, "rewards/chosen": -0.24751965701580048, "rewards/margins": -0.1494641900062561, "rewards/rejected": -0.09805545210838318, "step": 129 }, { "epoch": 0.17, "learning_rate": 4.9976895091056075e-05, "logits/chosen": -1.7724186182022095, "logits/rejected": -1.777117371559143, "logps/chosen": -171.97897338867188, "logps/rejected": -197.2012939453125, "loss": 0.6544, "rewards/accuracies": 0.5, "rewards/chosen": -0.3243841528892517, "rewards/margins": 0.10846509784460068, "rewards/rejected": -0.432849258184433, "step": 130 }, { "epoch": 0.17, "learning_rate": 4.9975329349282826e-05, "logits/chosen": -1.6474465131759644, "logits/rejected": -1.632018804550171, "logps/chosen": -176.29461669921875, "logps/rejected": -191.7507781982422, "loss": 0.7379, "rewards/accuracies": 0.5, "rewards/chosen": -0.5088250637054443, "rewards/margins": -0.030252262949943542, "rewards/rejected": -0.4785728454589844, "step": 131 }, { "epoch": 0.17, "learning_rate": 4.9973712306041256e-05, "logits/chosen": -2.033618927001953, "logits/rejected": -2.0342512130737305, "logps/chosen": -198.94796752929688, "logps/rejected": -192.28482055664062, "loss": 0.826, "rewards/accuracies": 0.3125, "rewards/chosen": -0.34583836793899536, "rewards/margins": -0.18120835721492767, "rewards/rejected": -0.1646299660205841, "step": 132 }, { "epoch": 0.17, "learning_rate": 4.997204396465292e-05, "logits/chosen": -1.5918235778808594, "logits/rejected": -1.5706799030303955, "logps/chosen": -157.1723175048828, "logps/rejected": -182.7075958251953, "loss": 0.8134, "rewards/accuracies": 0.5, "rewards/chosen": -0.5003759860992432, "rewards/margins": -0.07385056465864182, "rewards/rejected": -0.42652541399002075, "step": 133 }, { "epoch": 0.18, "learning_rate": 4.997032432854472e-05, "logits/chosen": -1.6958372592926025, "logits/rejected": -1.7072336673736572, "logps/chosen": -182.0128173828125, "logps/rejected": -202.56802368164062, "loss": 0.52, "rewards/accuracies": 0.75, "rewards/chosen": -0.08301500976085663, "rewards/margins": 0.4688724875450134, "rewards/rejected": -0.5518875122070312, "step": 134 }, { "epoch": 0.18, "learning_rate": 4.996855340124894e-05, "logits/chosen": -1.7832672595977783, "logits/rejected": -1.767440915107727, "logps/chosen": -193.7747344970703, "logps/rejected": -197.20289611816406, "loss": 0.7392, "rewards/accuracies": 0.3125, "rewards/chosen": -0.39458268880844116, "rewards/margins": -0.038821250200271606, "rewards/rejected": -0.35576146841049194, "step": 135 }, { "epoch": 0.18, "learning_rate": 4.996673118640323e-05, "logits/chosen": -1.7633535861968994, "logits/rejected": -1.7420529127120972, "logps/chosen": -154.8060760498047, "logps/rejected": -170.86996459960938, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": -0.15521948039531708, "rewards/margins": 0.2389230877161026, "rewards/rejected": -0.3941425681114197, "step": 136 }, { "epoch": 0.18, "learning_rate": 4.996485768775055e-05, "logits/chosen": -1.7529706954956055, "logits/rejected": -1.7141683101654053, "logps/chosen": -233.09747314453125, "logps/rejected": -237.85906982421875, "loss": 0.7094, "rewards/accuracies": 0.5, "rewards/chosen": -0.4181414544582367, "rewards/margins": 0.051600366830825806, "rewards/rejected": -0.4697418212890625, "step": 137 }, { "epoch": 0.18, "learning_rate": 4.996293290913926e-05, "logits/chosen": -1.6701140403747559, "logits/rejected": -1.684870719909668, "logps/chosen": -192.6127166748047, "logps/rejected": -188.32798767089844, "loss": 0.5773, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0851680114865303, "rewards/margins": 0.31171905994415283, "rewards/rejected": -0.39688706398010254, "step": 138 }, { "epoch": 0.18, "learning_rate": 4.9960956854522986e-05, "logits/chosen": -1.7777843475341797, "logits/rejected": -1.73331618309021, "logps/chosen": -199.59889221191406, "logps/rejected": -174.27069091796875, "loss": 0.785, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5378831028938293, "rewards/margins": -0.13611721992492676, "rewards/rejected": -0.4017658531665802, "step": 139 }, { "epoch": 0.18, "learning_rate": 4.995892952796074e-05, "logits/chosen": -1.7307249307632446, "logits/rejected": -1.7584043741226196, "logps/chosen": -184.14364624023438, "logps/rejected": -196.90878295898438, "loss": 0.6489, "rewards/accuracies": 0.75, "rewards/chosen": -0.4751961827278137, "rewards/margins": 0.1301158368587494, "rewards/rejected": -0.6053119897842407, "step": 140 }, { "epoch": 0.18, "learning_rate": 4.995685093361682e-05, "logits/chosen": -1.770018219947815, "logits/rejected": -1.832364797592163, "logps/chosen": -183.0137939453125, "logps/rejected": -204.83804321289062, "loss": 0.7497, "rewards/accuracies": 0.5, "rewards/chosen": -0.4263853132724762, "rewards/margins": -0.025808706879615784, "rewards/rejected": -0.4005766212940216, "step": 141 }, { "epoch": 0.19, "learning_rate": 4.9954721075760824e-05, "logits/chosen": -1.7604196071624756, "logits/rejected": -1.7073791027069092, "logps/chosen": -191.46923828125, "logps/rejected": -201.73545837402344, "loss": 0.8181, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5646533370018005, "rewards/margins": -0.1740642488002777, "rewards/rejected": -0.3905891180038452, "step": 142 }, { "epoch": 0.19, "learning_rate": 4.995253995876767e-05, "logits/chosen": -1.5798883438110352, "logits/rejected": -1.5167430639266968, "logps/chosen": -235.8765106201172, "logps/rejected": -222.53204345703125, "loss": 0.7115, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5441933870315552, "rewards/margins": 0.09400075674057007, "rewards/rejected": -0.6381941437721252, "step": 143 }, { "epoch": 0.19, "learning_rate": 4.995030758711756e-05, "logits/chosen": -1.932177186012268, "logits/rejected": -1.8748353719711304, "logps/chosen": -182.81915283203125, "logps/rejected": -170.09844970703125, "loss": 0.6448, "rewards/accuracies": 0.625, "rewards/chosen": -0.48413196206092834, "rewards/margins": 0.19957152009010315, "rewards/rejected": -0.6837034821510315, "step": 144 }, { "epoch": 0.19, "learning_rate": 4.994802396539598e-05, "logits/chosen": -1.8008129596710205, "logits/rejected": -1.7928048372268677, "logps/chosen": -202.68096923828125, "logps/rejected": -199.8201904296875, "loss": 0.8172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0157395601272583, "rewards/margins": -0.10564298927783966, "rewards/rejected": -0.9100965857505798, "step": 145 }, { "epoch": 0.19, "learning_rate": 4.994568909829368e-05, "logits/chosen": -1.7543656826019287, "logits/rejected": -1.7212865352630615, "logps/chosen": -201.3524932861328, "logps/rejected": -218.5485382080078, "loss": 0.9316, "rewards/accuracies": 0.1875, "rewards/chosen": -0.7368103265762329, "rewards/margins": -0.3777569532394409, "rewards/rejected": -0.359053373336792, "step": 146 }, { "epoch": 0.19, "learning_rate": 4.9943302990606684e-05, "logits/chosen": -1.7704360485076904, "logits/rejected": -1.6632460355758667, "logps/chosen": -187.3475341796875, "logps/rejected": -180.01144409179688, "loss": 0.751, "rewards/accuracies": 0.5, "rewards/chosen": -0.4657554626464844, "rewards/margins": 0.006175771355628967, "rewards/rejected": -0.47193124890327454, "step": 147 }, { "epoch": 0.19, "learning_rate": 4.994086564723626e-05, "logits/chosen": -1.9261763095855713, "logits/rejected": -1.9572409391403198, "logps/chosen": -171.56101989746094, "logps/rejected": -182.58717346191406, "loss": 0.7349, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7470525503158569, "rewards/margins": -0.043389588594436646, "rewards/rejected": -0.7036629915237427, "step": 148 }, { "epoch": 0.19, "learning_rate": 4.9938377073188905e-05, "logits/chosen": -1.9480628967285156, "logits/rejected": -2.002164363861084, "logps/chosen": -197.41912841796875, "logps/rejected": -184.93325805664062, "loss": 0.813, "rewards/accuracies": 0.375, "rewards/chosen": -0.6927478909492493, "rewards/margins": -0.12049313634634018, "rewards/rejected": -0.5722547769546509, "step": 149 }, { "epoch": 0.2, "learning_rate": 4.993583727357638e-05, "logits/chosen": -1.6262449026107788, "logits/rejected": -1.640842080116272, "logps/chosen": -205.38461303710938, "logps/rejected": -213.60650634765625, "loss": 0.7821, "rewards/accuracies": 0.375, "rewards/chosen": -0.7866306304931641, "rewards/margins": -0.11763662099838257, "rewards/rejected": -0.6689940094947815, "step": 150 }, { "epoch": 0.2, "learning_rate": 4.993324625361565e-05, "logits/chosen": -1.8480533361434937, "logits/rejected": -1.8557144403457642, "logps/chosen": -158.26290893554688, "logps/rejected": -169.06105041503906, "loss": 0.7438, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6200124025344849, "rewards/margins": -0.012443792074918747, "rewards/rejected": -0.607568621635437, "step": 151 }, { "epoch": 0.2, "learning_rate": 4.993060401862888e-05, "logits/chosen": -1.8685041666030884, "logits/rejected": -1.8648606538772583, "logps/chosen": -176.7852020263672, "logps/rejected": -183.40328979492188, "loss": 0.6935, "rewards/accuracies": 0.625, "rewards/chosen": -0.7892077565193176, "rewards/margins": 0.08869240432977676, "rewards/rejected": -0.877900242805481, "step": 152 }, { "epoch": 0.2, "learning_rate": 4.9927910574043465e-05, "logits/chosen": -1.9234154224395752, "logits/rejected": -1.904573917388916, "logps/chosen": -159.97625732421875, "logps/rejected": -152.645263671875, "loss": 0.7778, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6230319142341614, "rewards/margins": -0.06979034841060638, "rewards/rejected": -0.5532415509223938, "step": 153 }, { "epoch": 0.2, "learning_rate": 4.992516592539196e-05, "logits/chosen": -1.6896902322769165, "logits/rejected": -1.7036737203598022, "logps/chosen": -148.6313018798828, "logps/rejected": -164.7644500732422, "loss": 0.5546, "rewards/accuracies": 0.625, "rewards/chosen": -0.29331690073013306, "rewards/margins": 0.44352981448173523, "rewards/rejected": -0.7368468046188354, "step": 154 }, { "epoch": 0.2, "learning_rate": 4.9922370078312105e-05, "logits/chosen": -2.013890266418457, "logits/rejected": -1.9424934387207031, "logps/chosen": -215.90118408203125, "logps/rejected": -209.59071350097656, "loss": 0.4938, "rewards/accuracies": 0.75, "rewards/chosen": -0.2655085027217865, "rewards/margins": 0.5123203992843628, "rewards/rejected": -0.7778289318084717, "step": 155 }, { "epoch": 0.2, "learning_rate": 4.991952303854682e-05, "logits/chosen": -1.8328962326049805, "logits/rejected": -1.8138638734817505, "logps/chosen": -170.13475036621094, "logps/rejected": -176.11810302734375, "loss": 0.6684, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3248033821582794, "rewards/margins": 0.12224595248699188, "rewards/rejected": -0.4470493197441101, "step": 156 }, { "epoch": 0.21, "learning_rate": 4.9916624811944175e-05, "logits/chosen": -1.9051162004470825, "logits/rejected": -1.9407715797424316, "logps/chosen": -177.2139434814453, "logps/rejected": -185.92947387695312, "loss": 0.6297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6396099328994751, "rewards/margins": 0.1931779384613037, "rewards/rejected": -0.832787811756134, "step": 157 }, { "epoch": 0.21, "learning_rate": 4.991367540445735e-05, "logits/chosen": -1.7430989742279053, "logits/rejected": -1.7986749410629272, "logps/chosen": -199.38021850585938, "logps/rejected": -195.27647399902344, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -0.8835601210594177, "rewards/margins": 0.10807879269123077, "rewards/rejected": -0.9916388392448425, "step": 158 }, { "epoch": 0.21, "learning_rate": 4.991067482214471e-05, "logits/chosen": -1.868577241897583, "logits/rejected": -1.799201488494873, "logps/chosen": -177.93130493164062, "logps/rejected": -164.038330078125, "loss": 0.7179, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7006940245628357, "rewards/margins": 0.020756253972649574, "rewards/rejected": -0.7214502096176147, "step": 159 }, { "epoch": 0.21, "learning_rate": 4.9907623071169686e-05, "logits/chosen": -1.8050721883773804, "logits/rejected": -1.7359880208969116, "logps/chosen": -197.66583251953125, "logps/rejected": -172.34146118164062, "loss": 0.6379, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7713190317153931, "rewards/margins": 0.19513630867004395, "rewards/rejected": -0.966455340385437, "step": 160 }, { "epoch": 0.21, "learning_rate": 4.990452015780085e-05, "logits/chosen": -1.74982750415802, "logits/rejected": -1.719763159751892, "logps/chosen": -204.21517944335938, "logps/rejected": -196.62576293945312, "loss": 0.7434, "rewards/accuracies": 0.4375, "rewards/chosen": -0.35949403047561646, "rewards/margins": 0.03506145626306534, "rewards/rejected": -0.3945554494857788, "step": 161 }, { "epoch": 0.21, "learning_rate": 4.9901366088411846e-05, "logits/chosen": -1.6477172374725342, "logits/rejected": -1.622018814086914, "logps/chosen": -207.30174255371094, "logps/rejected": -229.1028289794922, "loss": 0.9207, "rewards/accuracies": 0.4375, "rewards/chosen": -1.1269769668579102, "rewards/margins": -0.2236841320991516, "rewards/rejected": -0.9032928943634033, "step": 162 }, { "epoch": 0.21, "learning_rate": 4.98981608694814e-05, "logits/chosen": -1.8223265409469604, "logits/rejected": -1.8162957429885864, "logps/chosen": -171.70675659179688, "logps/rejected": -171.83108520507812, "loss": 0.9174, "rewards/accuracies": 0.5, "rewards/chosen": -0.8722136616706848, "rewards/margins": -0.21637174487113953, "rewards/rejected": -0.6558419466018677, "step": 163 }, { "epoch": 0.21, "learning_rate": 4.9894904507593316e-05, "logits/chosen": -1.9424258470535278, "logits/rejected": -1.828155279159546, "logps/chosen": -196.1715545654297, "logps/rejected": -192.08624267578125, "loss": 0.7506, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5839821696281433, "rewards/margins": 0.0038854647427797318, "rewards/rejected": -0.5878676772117615, "step": 164 }, { "epoch": 0.22, "learning_rate": 4.989159700943643e-05, "logits/chosen": -1.623518705368042, "logits/rejected": -1.679386854171753, "logps/chosen": -175.6849365234375, "logps/rejected": -179.52059936523438, "loss": 0.9491, "rewards/accuracies": 0.375, "rewards/chosen": -0.7352603673934937, "rewards/margins": -0.2515270411968231, "rewards/rejected": -0.4837333559989929, "step": 165 }, { "epoch": 0.22, "learning_rate": 4.988823838180464e-05, "logits/chosen": -1.997894048690796, "logits/rejected": -1.9666211605072021, "logps/chosen": -167.2881622314453, "logps/rejected": -183.72474670410156, "loss": 0.6663, "rewards/accuracies": 0.625, "rewards/chosen": -0.5555349588394165, "rewards/margins": 0.17635540664196014, "rewards/rejected": -0.7318904399871826, "step": 166 }, { "epoch": 0.22, "learning_rate": 4.988482863159684e-05, "logits/chosen": -1.8912848234176636, "logits/rejected": -1.968542218208313, "logps/chosen": -174.55911254882812, "logps/rejected": -169.36610412597656, "loss": 0.7905, "rewards/accuracies": 0.4375, "rewards/chosen": -0.227503702044487, "rewards/margins": -0.08394889533519745, "rewards/rejected": -0.14355483651161194, "step": 167 }, { "epoch": 0.22, "learning_rate": 4.988136776581696e-05, "logits/chosen": -2.151402711868286, "logits/rejected": -2.186088800430298, "logps/chosen": -157.3328094482422, "logps/rejected": -150.9196319580078, "loss": 0.6989, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29456794261932373, "rewards/margins": 0.04511295258998871, "rewards/rejected": -0.33968091011047363, "step": 168 }, { "epoch": 0.22, "learning_rate": 4.9877855791573915e-05, "logits/chosen": -1.8164244890213013, "logits/rejected": -1.8539032936096191, "logps/chosen": -188.2481689453125, "logps/rejected": -171.02090454101562, "loss": 0.9341, "rewards/accuracies": 0.375, "rewards/chosen": -0.5969239473342896, "rewards/margins": -0.33455953001976013, "rewards/rejected": -0.2623644471168518, "step": 169 }, { "epoch": 0.22, "learning_rate": 4.9874292716081595e-05, "logits/chosen": -1.7419726848602295, "logits/rejected": -1.7444337606430054, "logps/chosen": -169.46658325195312, "logps/rejected": -173.33348083496094, "loss": 0.6817, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4356076121330261, "rewards/margins": 0.14688673615455627, "rewards/rejected": -0.5824943780899048, "step": 170 }, { "epoch": 0.22, "learning_rate": 4.9870678546658865e-05, "logits/chosen": -1.6884466409683228, "logits/rejected": -1.7295485734939575, "logps/chosen": -160.8187713623047, "logps/rejected": -176.16746520996094, "loss": 0.9101, "rewards/accuracies": 0.125, "rewards/chosen": -0.6096308827400208, "rewards/margins": -0.28187263011932373, "rewards/rejected": -0.3277583122253418, "step": 171 }, { "epoch": 0.23, "learning_rate": 4.9867013290729535e-05, "logits/chosen": -1.932777762413025, "logits/rejected": -1.8993282318115234, "logps/chosen": -198.2738037109375, "logps/rejected": -197.27252197265625, "loss": 0.7183, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19178181886672974, "rewards/margins": 0.006053738296031952, "rewards/rejected": -0.1978355497121811, "step": 172 }, { "epoch": 0.23, "learning_rate": 4.986329695582237e-05, "logits/chosen": -2.03489351272583, "logits/rejected": -2.076827049255371, "logps/chosen": -179.49679565429688, "logps/rejected": -177.2965545654297, "loss": 0.7798, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6112560629844666, "rewards/margins": -0.09085651487112045, "rewards/rejected": -0.5203995704650879, "step": 173 }, { "epoch": 0.23, "learning_rate": 4.985952954957103e-05, "logits/chosen": -1.8877630233764648, "logits/rejected": -1.8290894031524658, "logps/chosen": -233.42221069335938, "logps/rejected": -228.90179443359375, "loss": 0.7997, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4796496629714966, "rewards/margins": 0.0218522846698761, "rewards/rejected": -0.5015019774436951, "step": 174 }, { "epoch": 0.23, "learning_rate": 4.985571107971408e-05, "logits/chosen": -1.8358758687973022, "logits/rejected": -1.8217942714691162, "logps/chosen": -173.35556030273438, "logps/rejected": -176.17031860351562, "loss": 0.6678, "rewards/accuracies": 0.625, "rewards/chosen": 0.03331407904624939, "rewards/margins": 0.18146347999572754, "rewards/rejected": -0.14814940094947815, "step": 175 }, { "epoch": 0.23, "learning_rate": 4.9851841554095e-05, "logits/chosen": -1.9195314645767212, "logits/rejected": -1.9220951795578003, "logps/chosen": -233.7957305908203, "logps/rejected": -216.81581115722656, "loss": 0.5707, "rewards/accuracies": 0.75, "rewards/chosen": -0.0034320950508117676, "rewards/margins": 0.3490698039531708, "rewards/rejected": -0.35250189900398254, "step": 176 }, { "epoch": 0.23, "learning_rate": 4.9847920980662134e-05, "logits/chosen": -1.598222255706787, "logits/rejected": -1.591965913772583, "logps/chosen": -226.130615234375, "logps/rejected": -236.63760375976562, "loss": 0.6588, "rewards/accuracies": 0.5, "rewards/chosen": 0.06091824173927307, "rewards/margins": 0.13529105484485626, "rewards/rejected": -0.07437281310558319, "step": 177 }, { "epoch": 0.23, "learning_rate": 4.984394936746865e-05, "logits/chosen": -1.7949796915054321, "logits/rejected": -1.805631399154663, "logps/chosen": -232.09498596191406, "logps/rejected": -235.5016632080078, "loss": 0.709, "rewards/accuracies": 0.625, "rewards/chosen": -0.5284535884857178, "rewards/margins": 0.1780376434326172, "rewards/rejected": -0.706491231918335, "step": 178 }, { "epoch": 0.23, "learning_rate": 4.98399267226726e-05, "logits/chosen": -2.0285134315490723, "logits/rejected": -2.060884714126587, "logps/chosen": -183.62449645996094, "logps/rejected": -172.28872680664062, "loss": 0.7795, "rewards/accuracies": 0.375, "rewards/chosen": -0.42201852798461914, "rewards/margins": -0.029273340478539467, "rewards/rejected": -0.3927451968193054, "step": 179 }, { "epoch": 0.24, "learning_rate": 4.9835853054536846e-05, "logits/chosen": -1.7763793468475342, "logits/rejected": -1.8531831502914429, "logps/chosen": -182.07516479492188, "logps/rejected": -205.60504150390625, "loss": 0.9431, "rewards/accuracies": 0.3125, "rewards/chosen": -0.2773244380950928, "rewards/margins": -0.36518311500549316, "rewards/rejected": 0.08785867691040039, "step": 180 }, { "epoch": 0.24, "learning_rate": 4.9831728371429046e-05, "logits/chosen": -1.8723288774490356, "logits/rejected": -1.9752717018127441, "logps/chosen": -200.94346618652344, "logps/rejected": -210.53933715820312, "loss": 0.7466, "rewards/accuracies": 0.5, "rewards/chosen": -0.15453863143920898, "rewards/margins": -0.037006717175245285, "rewards/rejected": -0.117531917989254, "step": 181 }, { "epoch": 0.24, "learning_rate": 4.982755268182164e-05, "logits/chosen": -1.7186784744262695, "logits/rejected": -1.752870798110962, "logps/chosen": -168.73394775390625, "logps/rejected": -177.32054138183594, "loss": 0.7175, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1720157265663147, "rewards/margins": -0.019572071731090546, "rewards/rejected": -0.15244367718696594, "step": 182 }, { "epoch": 0.24, "learning_rate": 4.982332599429187e-05, "logits/chosen": -1.9437085390090942, "logits/rejected": -1.9051276445388794, "logps/chosen": -168.2744903564453, "logps/rejected": -158.18289184570312, "loss": 0.8465, "rewards/accuracies": 0.375, "rewards/chosen": -0.40080204606056213, "rewards/margins": -0.24280087649822235, "rewards/rejected": -0.15800118446350098, "step": 183 }, { "epoch": 0.24, "learning_rate": 4.981904831752171e-05, "logits/chosen": -1.9985157251358032, "logits/rejected": -1.9973095655441284, "logps/chosen": -182.0140380859375, "logps/rejected": -169.61883544921875, "loss": 0.6244, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13748487830162048, "rewards/margins": 0.31921717524528503, "rewards/rejected": -0.4567020535469055, "step": 184 }, { "epoch": 0.24, "learning_rate": 4.981471966029787e-05, "logits/chosen": -1.9101628065109253, "logits/rejected": -1.9257937669754028, "logps/chosen": -192.24288940429688, "logps/rejected": -188.32144165039062, "loss": 0.6148, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2866635024547577, "rewards/margins": 0.2923651337623596, "rewards/rejected": -0.00570157915353775, "step": 185 }, { "epoch": 0.24, "learning_rate": 4.981034003151178e-05, "logits/chosen": -1.8881627321243286, "logits/rejected": -1.9550986289978027, "logps/chosen": -195.71372985839844, "logps/rejected": -210.1089324951172, "loss": 0.8689, "rewards/accuracies": 0.375, "rewards/chosen": -0.2077367603778839, "rewards/margins": -0.24497191607952118, "rewards/rejected": 0.03723515570163727, "step": 186 }, { "epoch": 0.24, "learning_rate": 4.980590944015958e-05, "logits/chosen": -1.7422230243682861, "logits/rejected": -1.7998031377792358, "logps/chosen": -216.39398193359375, "logps/rejected": -222.33880615234375, "loss": 0.7159, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11723195761442184, "rewards/margins": 0.26700979471206665, "rewards/rejected": -0.14977779984474182, "step": 187 }, { "epoch": 0.25, "learning_rate": 4.98014278953421e-05, "logits/chosen": -1.8071612119674683, "logits/rejected": -1.8180828094482422, "logps/chosen": -176.4374542236328, "logps/rejected": -189.68438720703125, "loss": 0.8578, "rewards/accuracies": 0.3125, "rewards/chosen": 0.02567705512046814, "rewards/margins": -0.21896688640117645, "rewards/rejected": 0.2446439117193222, "step": 188 }, { "epoch": 0.25, "learning_rate": 4.979689540626479e-05, "logits/chosen": -1.7617331743240356, "logits/rejected": -1.8094758987426758, "logps/chosen": -158.13441467285156, "logps/rejected": -177.64797973632812, "loss": 0.5662, "rewards/accuracies": 0.75, "rewards/chosen": 0.3027116656303406, "rewards/margins": 0.35941970348358154, "rewards/rejected": -0.05670810118317604, "step": 189 }, { "epoch": 0.25, "learning_rate": 4.9792311982237774e-05, "logits/chosen": -1.528577208518982, "logits/rejected": -1.5104334354400635, "logps/chosen": -164.3386993408203, "logps/rejected": -167.84840393066406, "loss": 0.7506, "rewards/accuracies": 0.5, "rewards/chosen": -0.04946248233318329, "rewards/margins": -0.02245260775089264, "rewards/rejected": -0.027009889483451843, "step": 190 }, { "epoch": 0.25, "learning_rate": 4.9787677632675825e-05, "logits/chosen": -1.86935555934906, "logits/rejected": -1.8780128955841064, "logps/chosen": -167.26119995117188, "logps/rejected": -187.39584350585938, "loss": 0.6634, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09376392513513565, "rewards/margins": 0.16032886505126953, "rewards/rejected": -0.06656493991613388, "step": 191 }, { "epoch": 0.25, "learning_rate": 4.978299236709826e-05, "logits/chosen": -1.79931640625, "logits/rejected": -1.7840967178344727, "logps/chosen": -173.83987426757812, "logps/rejected": -174.4556427001953, "loss": 0.8509, "rewards/accuracies": 0.375, "rewards/chosen": -0.17128746211528778, "rewards/margins": -0.21691852807998657, "rewards/rejected": 0.04563106596469879, "step": 192 }, { "epoch": 0.25, "learning_rate": 4.977825619512904e-05, "logits/chosen": -1.9714162349700928, "logits/rejected": -1.9255212545394897, "logps/chosen": -216.35391235351562, "logps/rejected": -213.25570678710938, "loss": 0.787, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07267741113901138, "rewards/margins": -0.07426212728023529, "rewards/rejected": 0.0015847217291593552, "step": 193 }, { "epoch": 0.25, "learning_rate": 4.977346912649666e-05, "logits/chosen": -1.8389030694961548, "logits/rejected": -1.872612476348877, "logps/chosen": -199.1173095703125, "logps/rejected": -170.51205444335938, "loss": 0.6682, "rewards/accuracies": 0.625, "rewards/chosen": -0.054506294429302216, "rewards/margins": 0.1410951465368271, "rewards/rejected": -0.1956014782190323, "step": 194 }, { "epoch": 0.26, "learning_rate": 4.9768631171034175e-05, "logits/chosen": -1.6105570793151855, "logits/rejected": -1.655145525932312, "logps/chosen": -183.67723083496094, "logps/rejected": -176.943115234375, "loss": 0.8043, "rewards/accuracies": 0.3125, "rewards/chosen": -0.09822743386030197, "rewards/margins": -0.16278451681137085, "rewards/rejected": 0.06455708295106888, "step": 195 }, { "epoch": 0.26, "learning_rate": 4.9763742338679145e-05, "logits/chosen": -1.589104413986206, "logits/rejected": -1.544286847114563, "logps/chosen": -188.53167724609375, "logps/rejected": -190.18521118164062, "loss": 0.8959, "rewards/accuracies": 0.375, "rewards/chosen": -0.08583441376686096, "rewards/margins": -0.30193185806274414, "rewards/rejected": 0.21609747409820557, "step": 196 }, { "epoch": 0.26, "learning_rate": 4.975880263947367e-05, "logits/chosen": -1.5240559577941895, "logits/rejected": -1.5749508142471313, "logps/chosen": -173.70506286621094, "logps/rejected": -174.54681396484375, "loss": 0.708, "rewards/accuracies": 0.5625, "rewards/chosen": 0.030573375523090363, "rewards/margins": 0.030332941561937332, "rewards/rejected": 0.00024041905999183655, "step": 197 }, { "epoch": 0.26, "learning_rate": 4.9753812083564304e-05, "logits/chosen": -1.879758596420288, "logits/rejected": -1.8182477951049805, "logps/chosen": -186.97525024414062, "logps/rejected": -159.67259216308594, "loss": 0.7133, "rewards/accuracies": 0.5, "rewards/chosen": 0.18908704817295074, "rewards/margins": 0.015400439500808716, "rewards/rejected": 0.17368660867214203, "step": 198 }, { "epoch": 0.26, "learning_rate": 4.974877068120208e-05, "logits/chosen": -1.8003509044647217, "logits/rejected": -1.8137016296386719, "logps/chosen": -180.23757934570312, "logps/rejected": -191.359375, "loss": 0.7853, "rewards/accuracies": 0.5, "rewards/chosen": 0.0054572634398937225, "rewards/margins": -0.046278372406959534, "rewards/rejected": 0.05173564702272415, "step": 199 }, { "epoch": 0.26, "learning_rate": 4.974367844274248e-05, "logits/chosen": -1.7015259265899658, "logits/rejected": -1.6682251691818237, "logps/chosen": -155.98707580566406, "logps/rejected": -146.8739776611328, "loss": 0.7147, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09909596294164658, "rewards/margins": 0.09801648557186127, "rewards/rejected": 0.001079469919204712, "step": 200 }, { "epoch": 0.26, "learning_rate": 4.973853537864538e-05, "logits/chosen": -1.9078797101974487, "logits/rejected": -1.8878577947616577, "logps/chosen": -213.63034057617188, "logps/rejected": -196.01133728027344, "loss": 0.7176, "rewards/accuracies": 0.5, "rewards/chosen": -0.07858332991600037, "rewards/margins": 0.04805755987763405, "rewards/rejected": -0.12664087116718292, "step": 201 }, { "epoch": 0.26, "learning_rate": 4.973334149947508e-05, "logits/chosen": -1.8100643157958984, "logits/rejected": -1.8916385173797607, "logps/chosen": -154.20033264160156, "logps/rejected": -180.57833862304688, "loss": 0.9816, "rewards/accuracies": 0.25, "rewards/chosen": -0.11480588465929031, "rewards/margins": -0.3577505350112915, "rewards/rejected": 0.24294468760490417, "step": 202 }, { "epoch": 0.27, "learning_rate": 4.972809681590026e-05, "logits/chosen": -1.6414841413497925, "logits/rejected": -1.6490445137023926, "logps/chosen": -186.58531188964844, "logps/rejected": -188.83343505859375, "loss": 0.7131, "rewards/accuracies": 0.5, "rewards/chosen": 0.0989757627248764, "rewards/margins": 0.02306460589170456, "rewards/rejected": 0.07591113448143005, "step": 203 }, { "epoch": 0.27, "learning_rate": 4.972280133869396e-05, "logits/chosen": -1.7980338335037231, "logits/rejected": -1.8528972864151, "logps/chosen": -194.47787475585938, "logps/rejected": -186.64306640625, "loss": 0.7408, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09773577749729156, "rewards/margins": 0.03546319156885147, "rewards/rejected": 0.06227259710431099, "step": 204 }, { "epoch": 0.27, "learning_rate": 4.971745507873352e-05, "logits/chosen": -1.8588396310806274, "logits/rejected": -1.7567483186721802, "logps/chosen": -181.69345092773438, "logps/rejected": -187.5961151123047, "loss": 0.8451, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1902497261762619, "rewards/margins": -0.21404078602790833, "rewards/rejected": 0.023791024461388588, "step": 205 }, { "epoch": 0.27, "learning_rate": 4.971205804700063e-05, "logits/chosen": -1.8144092559814453, "logits/rejected": -1.8621768951416016, "logps/chosen": -143.86961364746094, "logps/rejected": -168.99295043945312, "loss": 0.8074, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11274349689483643, "rewards/margins": -0.13580113649368286, "rewards/rejected": 0.023057660087943077, "step": 206 }, { "epoch": 0.27, "learning_rate": 4.970661025458125e-05, "logits/chosen": -1.573486089706421, "logits/rejected": -1.5931901931762695, "logps/chosen": -157.45480346679688, "logps/rejected": -169.86550903320312, "loss": 0.7474, "rewards/accuracies": 0.375, "rewards/chosen": 0.007056853733956814, "rewards/margins": -0.06664810329675674, "rewards/rejected": 0.07370495796203613, "step": 207 }, { "epoch": 0.27, "learning_rate": 4.9701111712665625e-05, "logits/chosen": -2.0106117725372314, "logits/rejected": -1.9366306066513062, "logps/chosen": -190.40353393554688, "logps/rejected": -176.56820678710938, "loss": 0.6494, "rewards/accuracies": 0.4375, "rewards/chosen": 0.12490460276603699, "rewards/margins": 0.2138100564479828, "rewards/rejected": -0.08890549838542938, "step": 208 }, { "epoch": 0.27, "learning_rate": 4.969556243254822e-05, "logits/chosen": -1.7928516864776611, "logits/rejected": -1.8632910251617432, "logps/chosen": -232.3812255859375, "logps/rejected": -237.18328857421875, "loss": 0.725, "rewards/accuracies": 0.5, "rewards/chosen": 0.20639315247535706, "rewards/margins": 0.010047540068626404, "rewards/rejected": 0.19634561240673065, "step": 209 }, { "epoch": 0.27, "learning_rate": 4.968996242562774e-05, "logits/chosen": -1.8890585899353027, "logits/rejected": -1.8688653707504272, "logps/chosen": -191.57362365722656, "logps/rejected": -199.00550842285156, "loss": 0.7262, "rewards/accuracies": 0.375, "rewards/chosen": -0.2404320389032364, "rewards/margins": -0.03374467045068741, "rewards/rejected": -0.2066873461008072, "step": 210 }, { "epoch": 0.28, "learning_rate": 4.968431170340706e-05, "logits/chosen": -1.6715140342712402, "logits/rejected": -1.619262456893921, "logps/chosen": -178.09326171875, "logps/rejected": -181.5880126953125, "loss": 0.7146, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08340275287628174, "rewards/margins": 0.0807909369468689, "rewards/rejected": 0.0026118261739611626, "step": 211 }, { "epoch": 0.28, "learning_rate": 4.9678610277493275e-05, "logits/chosen": -1.6335276365280151, "logits/rejected": -1.6984977722167969, "logps/chosen": -181.51470947265625, "logps/rejected": -188.1268768310547, "loss": 0.6818, "rewards/accuracies": 0.5, "rewards/chosen": 0.13411936163902283, "rewards/margins": 0.05135034769773483, "rewards/rejected": 0.0827689841389656, "step": 212 }, { "epoch": 0.28, "learning_rate": 4.967285815959759e-05, "logits/chosen": -1.5702835321426392, "logits/rejected": -1.6215555667877197, "logps/chosen": -177.0418701171875, "logps/rejected": -186.50494384765625, "loss": 0.6603, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04579095542430878, "rewards/margins": 0.188707172870636, "rewards/rejected": -0.23449814319610596, "step": 213 }, { "epoch": 0.28, "learning_rate": 4.9667055361535354e-05, "logits/chosen": -1.7180269956588745, "logits/rejected": -1.7135778665542603, "logps/chosen": -195.20785522460938, "logps/rejected": -210.96878051757812, "loss": 0.9002, "rewards/accuracies": 0.5, "rewards/chosen": 0.03346429392695427, "rewards/margins": -0.26186707615852356, "rewards/rejected": 0.29533132910728455, "step": 214 }, { "epoch": 0.28, "learning_rate": 4.9661201895226e-05, "logits/chosen": -1.7542705535888672, "logits/rejected": -1.7284282445907593, "logps/chosen": -173.01751708984375, "logps/rejected": -157.28419494628906, "loss": 0.6185, "rewards/accuracies": 0.75, "rewards/chosen": 0.02107839845120907, "rewards/margins": 0.2878818213939667, "rewards/rejected": -0.26680341362953186, "step": 215 }, { "epoch": 0.28, "learning_rate": 4.965529777269306e-05, "logits/chosen": -1.736549973487854, "logits/rejected": -1.771423578262329, "logps/chosen": -158.92160034179688, "logps/rejected": -163.87283325195312, "loss": 0.7139, "rewards/accuracies": 0.5, "rewards/chosen": 0.006686069071292877, "rewards/margins": 0.04404951259493828, "rewards/rejected": -0.0373634397983551, "step": 216 }, { "epoch": 0.28, "learning_rate": 4.964934300606411e-05, "logits/chosen": -1.511568307876587, "logits/rejected": -1.5145093202590942, "logps/chosen": -170.07809448242188, "logps/rejected": -186.4696807861328, "loss": 0.607, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2800918221473694, "rewards/margins": 0.30444180965423584, "rewards/rejected": -0.02434997633099556, "step": 217 }, { "epoch": 0.29, "learning_rate": 4.964333760757074e-05, "logits/chosen": -1.436962366104126, "logits/rejected": -1.4119391441345215, "logps/chosen": -309.4395446777344, "logps/rejected": -291.1634521484375, "loss": 0.6898, "rewards/accuracies": 0.375, "rewards/chosen": 0.0010376125574111938, "rewards/margins": 0.03738358989357948, "rewards/rejected": -0.036345988512039185, "step": 218 }, { "epoch": 0.29, "learning_rate": 4.963728158954856e-05, "logits/chosen": -1.891182541847229, "logits/rejected": -1.8770077228546143, "logps/chosen": -162.81988525390625, "logps/rejected": -169.5299072265625, "loss": 0.8258, "rewards/accuracies": 0.4375, "rewards/chosen": 0.07328500598669052, "rewards/margins": -0.13723579049110413, "rewards/rejected": 0.21052080392837524, "step": 219 }, { "epoch": 0.29, "learning_rate": 4.963117496443715e-05, "logits/chosen": -1.8470525741577148, "logits/rejected": -1.8625476360321045, "logps/chosen": -166.7591552734375, "logps/rejected": -194.91290283203125, "loss": 0.9473, "rewards/accuracies": 0.1875, "rewards/chosen": -0.4374409317970276, "rewards/margins": -0.35452863574028015, "rewards/rejected": -0.08291231095790863, "step": 220 }, { "epoch": 0.29, "learning_rate": 4.9625017744780045e-05, "logits/chosen": -1.5161206722259521, "logits/rejected": -1.4952480792999268, "logps/chosen": -173.3487548828125, "logps/rejected": -167.93292236328125, "loss": 0.8109, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0004083700478076935, "rewards/margins": -0.171901136636734, "rewards/rejected": 0.1714927703142166, "step": 221 }, { "epoch": 0.29, "learning_rate": 4.96188099432247e-05, "logits/chosen": -1.617663025856018, "logits/rejected": -1.6117898225784302, "logps/chosen": -194.1741180419922, "logps/rejected": -188.07025146484375, "loss": 0.7477, "rewards/accuracies": 0.3125, "rewards/chosen": -0.21349965035915375, "rewards/margins": 0.016070939600467682, "rewards/rejected": -0.22957059741020203, "step": 222 }, { "epoch": 0.29, "learning_rate": 4.9612551572522464e-05, "logits/chosen": -1.899290680885315, "logits/rejected": -1.8964145183563232, "logps/chosen": -192.67469787597656, "logps/rejected": -179.21112060546875, "loss": 1.0079, "rewards/accuracies": 0.5, "rewards/chosen": -0.24318143725395203, "rewards/margins": -0.3611811697483063, "rewards/rejected": 0.11799970269203186, "step": 223 }, { "epoch": 0.29, "learning_rate": 4.960624264552858e-05, "logits/chosen": -1.6361061334609985, "logits/rejected": -1.6821738481521606, "logps/chosen": -194.6638641357422, "logps/rejected": -179.90225219726562, "loss": 0.6424, "rewards/accuracies": 0.5625, "rewards/chosen": 0.19569191336631775, "rewards/margins": 0.2166380137205124, "rewards/rejected": -0.020946092903614044, "step": 224 }, { "epoch": 0.29, "learning_rate": 4.9599883175202124e-05, "logits/chosen": -1.522665023803711, "logits/rejected": -1.4538843631744385, "logps/chosen": -222.08251953125, "logps/rejected": -200.12472534179688, "loss": 0.8261, "rewards/accuracies": 0.25, "rewards/chosen": -0.26220959424972534, "rewards/margins": -0.16530472040176392, "rewards/rejected": -0.09690490365028381, "step": 225 }, { "epoch": 0.3, "learning_rate": 4.9593473174605974e-05, "logits/chosen": -1.4936704635620117, "logits/rejected": -1.5721888542175293, "logps/chosen": -182.029541015625, "logps/rejected": -204.53567504882812, "loss": 0.6881, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3360684812068939, "rewards/margins": 0.14660386741161346, "rewards/rejected": -0.4826723635196686, "step": 226 }, { "epoch": 0.3, "learning_rate": 4.958701265690685e-05, "logits/chosen": -1.6544736623764038, "logits/rejected": -1.672118902206421, "logps/chosen": -174.0331268310547, "logps/rejected": -203.7425079345703, "loss": 0.7058, "rewards/accuracies": 0.4375, "rewards/chosen": -0.34017136693000793, "rewards/margins": 0.01865684613585472, "rewards/rejected": -0.3588281571865082, "step": 227 }, { "epoch": 0.3, "learning_rate": 4.958050163537519e-05, "logits/chosen": -1.8430697917938232, "logits/rejected": -1.7734485864639282, "logps/chosen": -208.96421813964844, "logps/rejected": -212.52711486816406, "loss": 0.8319, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6053764820098877, "rewards/margins": -0.1353476196527481, "rewards/rejected": -0.4700288772583008, "step": 228 }, { "epoch": 0.3, "learning_rate": 4.957394012338519e-05, "logits/chosen": -1.9725829362869263, "logits/rejected": -1.9301397800445557, "logps/chosen": -229.4776153564453, "logps/rejected": -215.60470581054688, "loss": 0.7281, "rewards/accuracies": 0.5625, "rewards/chosen": 0.046676263213157654, "rewards/margins": 0.04501792788505554, "rewards/rejected": 0.0016583409160375595, "step": 229 }, { "epoch": 0.3, "learning_rate": 4.956732813441477e-05, "logits/chosen": -1.733205795288086, "logits/rejected": -1.617655873298645, "logps/chosen": -174.42959594726562, "logps/rejected": -154.43499755859375, "loss": 0.9236, "rewards/accuracies": 0.3125, "rewards/chosen": -0.30461782217025757, "rewards/margins": -0.30838024616241455, "rewards/rejected": 0.0037624058313667774, "step": 230 }, { "epoch": 0.3, "learning_rate": 4.956066568204552e-05, "logits/chosen": -1.6661994457244873, "logits/rejected": -1.7003294229507446, "logps/chosen": -179.96853637695312, "logps/rejected": -187.9959716796875, "loss": 0.8541, "rewards/accuracies": 0.4375, "rewards/chosen": -0.47250720858573914, "rewards/margins": -0.12853975594043732, "rewards/rejected": -0.3439674377441406, "step": 231 }, { "epoch": 0.3, "learning_rate": 4.955395277996268e-05, "logits/chosen": -1.786563754081726, "logits/rejected": -1.7648732662200928, "logps/chosen": -193.4469757080078, "logps/rejected": -202.775146484375, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": -0.182941734790802, "rewards/margins": 0.3387344479560852, "rewards/rejected": -0.5216761231422424, "step": 232 }, { "epoch": 0.3, "learning_rate": 4.954718944195512e-05, "logits/chosen": -1.7109639644622803, "logits/rejected": -1.6762810945510864, "logps/chosen": -154.25538635253906, "logps/rejected": -158.89683532714844, "loss": 0.7073, "rewards/accuracies": 0.4375, "rewards/chosen": -0.25806286931037903, "rewards/margins": 0.007834136486053467, "rewards/rejected": -0.2658970057964325, "step": 233 }, { "epoch": 0.31, "learning_rate": 4.954037568191534e-05, "logits/chosen": -1.7765631675720215, "logits/rejected": -1.8208155632019043, "logps/chosen": -189.78883361816406, "logps/rejected": -214.85067749023438, "loss": 0.7049, "rewards/accuracies": 0.625, "rewards/chosen": -0.3182518482208252, "rewards/margins": 0.08276516944169998, "rewards/rejected": -0.4010169804096222, "step": 234 }, { "epoch": 0.31, "learning_rate": 4.9533511513839384e-05, "logits/chosen": -1.6697825193405151, "logits/rejected": -1.688278079032898, "logps/chosen": -174.92227172851562, "logps/rejected": -173.88099670410156, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": -0.08302205801010132, "rewards/margins": 0.34601137042045593, "rewards/rejected": -0.42903345823287964, "step": 235 }, { "epoch": 0.31, "learning_rate": 4.9526596951826824e-05, "logits/chosen": -1.9389506578445435, "logits/rejected": -1.8745498657226562, "logps/chosen": -193.29092407226562, "logps/rejected": -187.18719482421875, "loss": 0.8118, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6870115995407104, "rewards/margins": -0.025065027177333832, "rewards/rejected": -0.6619465351104736, "step": 236 }, { "epoch": 0.31, "learning_rate": 4.951963201008076e-05, "logits/chosen": -1.9140545129776, "logits/rejected": -1.8272314071655273, "logps/chosen": -154.29287719726562, "logps/rejected": -149.56021118164062, "loss": 0.8096, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10280251502990723, "rewards/margins": -0.07113885879516602, "rewards/rejected": -0.03166365623474121, "step": 237 }, { "epoch": 0.31, "learning_rate": 4.951261670290781e-05, "logits/chosen": -2.0082364082336426, "logits/rejected": -2.00789737701416, "logps/chosen": -186.12057495117188, "logps/rejected": -193.4689483642578, "loss": 0.7454, "rewards/accuracies": 0.375, "rewards/chosen": -0.14050771296024323, "rewards/margins": 0.02027921937406063, "rewards/rejected": -0.16078691184520721, "step": 238 }, { "epoch": 0.31, "learning_rate": 4.950555104471799e-05, "logits/chosen": -1.851813554763794, "logits/rejected": -1.8402018547058105, "logps/chosen": -157.9239959716797, "logps/rejected": -147.2781524658203, "loss": 0.7481, "rewards/accuracies": 0.5, "rewards/chosen": -0.21430522203445435, "rewards/margins": -0.030568838119506836, "rewards/rejected": -0.1837363839149475, "step": 239 }, { "epoch": 0.31, "learning_rate": 4.949843505002477e-05, "logits/chosen": -1.9467929601669312, "logits/rejected": -1.976270079612732, "logps/chosen": -170.30682373046875, "logps/rejected": -167.61927795410156, "loss": 0.7485, "rewards/accuracies": 0.375, "rewards/chosen": -0.2596588134765625, "rewards/margins": -0.021375911310315132, "rewards/rejected": -0.23828287422657013, "step": 240 }, { "epoch": 0.32, "learning_rate": 4.9491268733445034e-05, "logits/chosen": -1.724785327911377, "logits/rejected": -1.7340233325958252, "logps/chosen": -204.80548095703125, "logps/rejected": -209.44329833984375, "loss": 0.7051, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09321151673793793, "rewards/margins": 0.0742495059967041, "rewards/rejected": -0.16746100783348083, "step": 241 }, { "epoch": 0.32, "learning_rate": 4.9484052109698984e-05, "logits/chosen": -1.7430789470672607, "logits/rejected": -1.7313511371612549, "logps/chosen": -181.70632934570312, "logps/rejected": -162.24334716796875, "loss": 0.844, "rewards/accuracies": 0.625, "rewards/chosen": -0.014489106833934784, "rewards/margins": -0.029044844210147858, "rewards/rejected": 0.014555716887116432, "step": 242 }, { "epoch": 0.32, "learning_rate": 4.947678519361021e-05, "logits/chosen": -1.9160277843475342, "logits/rejected": -1.8753935098648071, "logps/chosen": -175.2951202392578, "logps/rejected": -161.4536590576172, "loss": 0.6499, "rewards/accuracies": 0.5, "rewards/chosen": -0.1920948028564453, "rewards/margins": 0.18029190599918365, "rewards/rejected": -0.3723866939544678, "step": 243 }, { "epoch": 0.32, "learning_rate": 4.946946800010556e-05, "logits/chosen": -1.788377046585083, "logits/rejected": -1.804762601852417, "logps/chosen": -190.5827178955078, "logps/rejected": -207.48460388183594, "loss": 0.7442, "rewards/accuracies": 0.4375, "rewards/chosen": -0.20643703639507294, "rewards/margins": 0.0016913870349526405, "rewards/rejected": -0.20812839269638062, "step": 244 }, { "epoch": 0.32, "learning_rate": 4.946210054421518e-05, "logits/chosen": -1.943693995475769, "logits/rejected": -1.9860758781433105, "logps/chosen": -162.47232055664062, "logps/rejected": -187.59640502929688, "loss": 0.5544, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04131259024143219, "rewards/margins": 0.34646379947662354, "rewards/rejected": -0.3877764344215393, "step": 245 }, { "epoch": 0.32, "learning_rate": 4.945468284107246e-05, "logits/chosen": -1.7154016494750977, "logits/rejected": -1.729323387145996, "logps/chosen": -151.67153930664062, "logps/rejected": -175.7374725341797, "loss": 0.7351, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3676840364933014, "rewards/margins": -0.05210195109248161, "rewards/rejected": -0.3155820667743683, "step": 246 }, { "epoch": 0.32, "learning_rate": 4.944721490591401e-05, "logits/chosen": -1.5419683456420898, "logits/rejected": -1.5722306966781616, "logps/chosen": -158.3173065185547, "logps/rejected": -168.21975708007812, "loss": 0.7106, "rewards/accuracies": 0.625, "rewards/chosen": 0.040096037089824677, "rewards/margins": 0.040343452244997025, "rewards/rejected": -0.00024740397930145264, "step": 247 }, { "epoch": 0.32, "learning_rate": 4.9439696754079595e-05, "logits/chosen": -1.8851487636566162, "logits/rejected": -1.927181601524353, "logps/chosen": -163.97447204589844, "logps/rejected": -171.12020874023438, "loss": 0.6697, "rewards/accuracies": 0.5, "rewards/chosen": -0.4691685140132904, "rewards/margins": 0.14997676014900208, "rewards/rejected": -0.6191452741622925, "step": 248 }, { "epoch": 0.33, "learning_rate": 4.9432128401012144e-05, "logits/chosen": -1.5929148197174072, "logits/rejected": -1.5544054508209229, "logps/chosen": -143.14022827148438, "logps/rejected": -158.71368408203125, "loss": 0.6763, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0918157696723938, "rewards/margins": 0.08131375163793564, "rewards/rejected": -0.17312952876091003, "step": 249 }, { "epoch": 0.33, "learning_rate": 4.9424509862257706e-05, "logits/chosen": -1.599873423576355, "logits/rejected": -1.5568993091583252, "logps/chosen": -197.35276794433594, "logps/rejected": -228.1996307373047, "loss": 0.6008, "rewards/accuracies": 0.75, "rewards/chosen": -0.22382640838623047, "rewards/margins": 0.30180901288986206, "rewards/rejected": -0.5256354808807373, "step": 250 }, { "epoch": 0.33, "learning_rate": 4.941684115346541e-05, "logits/chosen": -1.9682908058166504, "logits/rejected": -1.9601702690124512, "logps/chosen": -178.14833068847656, "logps/rejected": -180.44769287109375, "loss": 0.6096, "rewards/accuracies": 0.625, "rewards/chosen": -0.2652210593223572, "rewards/margins": 0.3119816184043884, "rewards/rejected": -0.5772026181221008, "step": 251 }, { "epoch": 0.33, "learning_rate": 4.940912229038745e-05, "logits/chosen": -1.7443188428878784, "logits/rejected": -1.720470666885376, "logps/chosen": -176.2379150390625, "logps/rejected": -166.10626220703125, "loss": 0.8199, "rewards/accuracies": 0.75, "rewards/chosen": -0.5284540057182312, "rewards/margins": -0.06340186297893524, "rewards/rejected": -0.46505218744277954, "step": 252 }, { "epoch": 0.33, "learning_rate": 4.9401353288879024e-05, "logits/chosen": -1.8005255460739136, "logits/rejected": -1.814915657043457, "logps/chosen": -173.22021484375, "logps/rejected": -187.9818878173828, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": -0.36548957228660583, "rewards/margins": 0.12811800837516785, "rewards/rejected": -0.49360761046409607, "step": 253 }, { "epoch": 0.33, "learning_rate": 4.9393534164898335e-05, "logits/chosen": -1.8766534328460693, "logits/rejected": -1.9305753707885742, "logps/chosen": -166.86866760253906, "logps/rejected": -195.80569458007812, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": -0.2575337588787079, "rewards/margins": 0.2505089044570923, "rewards/rejected": -0.5080426931381226, "step": 254 }, { "epoch": 0.33, "learning_rate": 4.9385664934506526e-05, "logits/chosen": -1.7149075269699097, "logits/rejected": -1.7573699951171875, "logps/chosen": -169.73626708984375, "logps/rejected": -178.3174591064453, "loss": 0.6638, "rewards/accuracies": 0.625, "rewards/chosen": -0.4145450294017792, "rewards/margins": 0.1643792688846588, "rewards/rejected": -0.578924298286438, "step": 255 }, { "epoch": 0.34, "learning_rate": 4.937774561386768e-05, "logits/chosen": -1.8144739866256714, "logits/rejected": -1.8054416179656982, "logps/chosen": -201.68247985839844, "logps/rejected": -208.70188903808594, "loss": 0.7625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2262319028377533, "rewards/margins": -0.053480371832847595, "rewards/rejected": -0.1727515161037445, "step": 256 }, { "epoch": 0.34, "learning_rate": 4.936977621924875e-05, "logits/chosen": -1.721892237663269, "logits/rejected": -1.7585711479187012, "logps/chosen": -193.45179748535156, "logps/rejected": -200.13726806640625, "loss": 0.798, "rewards/accuracies": 0.375, "rewards/chosen": -0.7297480702400208, "rewards/margins": -0.1376451551914215, "rewards/rejected": -0.5921030044555664, "step": 257 }, { "epoch": 0.34, "learning_rate": 4.9361756767019564e-05, "logits/chosen": -1.8132922649383545, "logits/rejected": -1.8062866926193237, "logps/chosen": -204.11619567871094, "logps/rejected": -196.605224609375, "loss": 0.802, "rewards/accuracies": 0.375, "rewards/chosen": -0.10710492730140686, "rewards/margins": -0.09782031178474426, "rewards/rejected": -0.009284593164920807, "step": 258 }, { "epoch": 0.34, "learning_rate": 4.935368727365276e-05, "logits/chosen": -1.6550960540771484, "logits/rejected": -1.6377525329589844, "logps/chosen": -191.87579345703125, "logps/rejected": -181.06930541992188, "loss": 0.7402, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8034918308258057, "rewards/margins": -0.04004772752523422, "rewards/rejected": -0.763444185256958, "step": 259 }, { "epoch": 0.34, "learning_rate": 4.934556775572377e-05, "logits/chosen": -1.9349067211151123, "logits/rejected": -1.9205700159072876, "logps/chosen": -173.06373596191406, "logps/rejected": -173.32766723632812, "loss": 0.6361, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24924635887145996, "rewards/margins": 0.23114144802093506, "rewards/rejected": -0.480387806892395, "step": 260 }, { "epoch": 0.34, "learning_rate": 4.9337398229910784e-05, "logits/chosen": -1.8233386278152466, "logits/rejected": -1.8753117322921753, "logps/chosen": -189.73959350585938, "logps/rejected": -197.85728454589844, "loss": 0.7225, "rewards/accuracies": 0.625, "rewards/chosen": -0.24001392722129822, "rewards/margins": 0.08805333077907562, "rewards/rejected": -0.32806724309921265, "step": 261 }, { "epoch": 0.34, "learning_rate": 4.932917871299471e-05, "logits/chosen": -1.540401816368103, "logits/rejected": -1.5170302391052246, "logps/chosen": -205.3408203125, "logps/rejected": -206.5533905029297, "loss": 0.8948, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7126679420471191, "rewards/margins": -0.22102710604667664, "rewards/rejected": -0.4916408061981201, "step": 262 }, { "epoch": 0.34, "learning_rate": 4.9320909221859134e-05, "logits/chosen": -1.934309482574463, "logits/rejected": -1.945433497428894, "logps/chosen": -170.4419708251953, "logps/rejected": -165.6936492919922, "loss": 0.7461, "rewards/accuracies": 0.5, "rewards/chosen": -0.23198306560516357, "rewards/margins": -0.018536821007728577, "rewards/rejected": -0.213446244597435, "step": 263 }, { "epoch": 0.35, "learning_rate": 4.9312589773490304e-05, "logits/chosen": -2.026982545852661, "logits/rejected": -1.9359885454177856, "logps/chosen": -185.02920532226562, "logps/rejected": -173.5999298095703, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": -0.39162588119506836, "rewards/margins": 0.09019112586975098, "rewards/rejected": -0.48181700706481934, "step": 264 }, { "epoch": 0.35, "learning_rate": 4.930422038497708e-05, "logits/chosen": -1.9103275537490845, "logits/rejected": -1.8527649641036987, "logps/chosen": -167.06378173828125, "logps/rejected": -153.1953125, "loss": 0.6232, "rewards/accuracies": 0.75, "rewards/chosen": -0.32273101806640625, "rewards/margins": 0.2372804582118988, "rewards/rejected": -0.5600115060806274, "step": 265 }, { "epoch": 0.35, "learning_rate": 4.92958010735109e-05, "logits/chosen": -1.9541754722595215, "logits/rejected": -2.0632808208465576, "logps/chosen": -181.56781005859375, "logps/rejected": -199.48483276367188, "loss": 0.5323, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1671663373708725, "rewards/margins": 0.5313636064529419, "rewards/rejected": -0.6985299587249756, "step": 266 }, { "epoch": 0.35, "learning_rate": 4.928733185638575e-05, "logits/chosen": -1.7843657732009888, "logits/rejected": -1.8627678155899048, "logps/chosen": -167.1883087158203, "logps/rejected": -172.48223876953125, "loss": 0.7816, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2200300097465515, "rewards/margins": -0.0496581606566906, "rewards/rejected": -0.1703718900680542, "step": 267 }, { "epoch": 0.35, "learning_rate": 4.927881275099815e-05, "logits/chosen": -1.713842511177063, "logits/rejected": -1.805971622467041, "logps/chosen": -192.92250061035156, "logps/rejected": -212.14866638183594, "loss": 0.6343, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17002353072166443, "rewards/margins": 0.28741562366485596, "rewards/rejected": -0.4574391543865204, "step": 268 }, { "epoch": 0.35, "learning_rate": 4.927024377484705e-05, "logits/chosen": -1.682020664215088, "logits/rejected": -1.7268104553222656, "logps/chosen": -156.587158203125, "logps/rejected": -159.53341674804688, "loss": 0.6657, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16453158855438232, "rewards/margins": 0.15859198570251465, "rewards/rejected": -0.323123574256897, "step": 269 }, { "epoch": 0.35, "learning_rate": 4.9261624945533855e-05, "logits/chosen": -1.8595139980316162, "logits/rejected": -1.8612048625946045, "logps/chosen": -163.1502685546875, "logps/rejected": -193.11166381835938, "loss": 0.6646, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2516610622406006, "rewards/margins": 0.18862426280975342, "rewards/rejected": -0.440285325050354, "step": 270 }, { "epoch": 0.35, "learning_rate": 4.925295628076241e-05, "logits/chosen": -1.8986504077911377, "logits/rejected": -1.951588749885559, "logps/chosen": -161.01625061035156, "logps/rejected": -171.39744567871094, "loss": 0.7413, "rewards/accuracies": 0.625, "rewards/chosen": -0.18460389971733093, "rewards/margins": -0.004900887608528137, "rewards/rejected": -0.1797029972076416, "step": 271 }, { "epoch": 0.36, "learning_rate": 4.9244237798338866e-05, "logits/chosen": -1.7942367792129517, "logits/rejected": -1.8609907627105713, "logps/chosen": -190.5836181640625, "logps/rejected": -181.18942260742188, "loss": 0.8213, "rewards/accuracies": 0.375, "rewards/chosen": -0.7920368909835815, "rewards/margins": -0.13006603717803955, "rewards/rejected": -0.6619707942008972, "step": 272 }, { "epoch": 0.36, "learning_rate": 4.923546951617175e-05, "logits/chosen": -1.7586820125579834, "logits/rejected": -1.8338139057159424, "logps/chosen": -161.4608154296875, "logps/rejected": -174.6810302734375, "loss": 0.7838, "rewards/accuracies": 0.375, "rewards/chosen": -0.15520796179771423, "rewards/margins": 0.007751762866973877, "rewards/rejected": -0.1629597246646881, "step": 273 }, { "epoch": 0.36, "learning_rate": 4.922665145227187e-05, "logits/chosen": -1.999558925628662, "logits/rejected": -1.9667410850524902, "logps/chosen": -177.38986206054688, "logps/rejected": -182.5379638671875, "loss": 0.8666, "rewards/accuracies": 0.4375, "rewards/chosen": -0.26553717255592346, "rewards/margins": -0.1137295514345169, "rewards/rejected": -0.15180760622024536, "step": 274 }, { "epoch": 0.36, "learning_rate": 4.9217783624752266e-05, "logits/chosen": -1.7799978256225586, "logits/rejected": -1.800316572189331, "logps/chosen": -189.352783203125, "logps/rejected": -173.5353546142578, "loss": 0.7974, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6156832575798035, "rewards/margins": -0.09660260379314423, "rewards/rejected": -0.519080638885498, "step": 275 }, { "epoch": 0.36, "learning_rate": 4.920886605182823e-05, "logits/chosen": -1.7154024839401245, "logits/rejected": -1.7451473474502563, "logps/chosen": -162.1999053955078, "logps/rejected": -177.0426483154297, "loss": 0.6234, "rewards/accuracies": 0.625, "rewards/chosen": -0.3276807367801666, "rewards/margins": 0.274338036775589, "rewards/rejected": -0.6020187139511108, "step": 276 }, { "epoch": 0.36, "learning_rate": 4.919989875181722e-05, "logits/chosen": -1.769112467765808, "logits/rejected": -1.759423851966858, "logps/chosen": -170.71876525878906, "logps/rejected": -173.2998809814453, "loss": 0.7953, "rewards/accuracies": 0.375, "rewards/chosen": -0.2965158224105835, "rewards/margins": 0.05386320501565933, "rewards/rejected": -0.3503790497779846, "step": 277 }, { "epoch": 0.36, "learning_rate": 4.919088174313884e-05, "logits/chosen": -1.2533071041107178, "logits/rejected": -1.3566581010818481, "logps/chosen": -200.62548828125, "logps/rejected": -190.6791534423828, "loss": 0.7712, "rewards/accuracies": 0.5, "rewards/chosen": -0.7463827729225159, "rewards/margins": -0.011319484561681747, "rewards/rejected": -0.7350633144378662, "step": 278 }, { "epoch": 0.37, "learning_rate": 4.91818150443148e-05, "logits/chosen": -1.958874225616455, "logits/rejected": -1.8372151851654053, "logps/chosen": -184.42295837402344, "logps/rejected": -178.30995178222656, "loss": 0.9, "rewards/accuracies": 0.1875, "rewards/chosen": -0.6464177370071411, "rewards/margins": -0.2897469401359558, "rewards/rejected": -0.3566707670688629, "step": 279 }, { "epoch": 0.37, "learning_rate": 4.917269867396886e-05, "logits/chosen": -1.731322169303894, "logits/rejected": -1.8198059797286987, "logps/chosen": -157.6179962158203, "logps/rejected": -162.81597900390625, "loss": 0.7364, "rewards/accuracies": 0.625, "rewards/chosen": -0.6854066848754883, "rewards/margins": 0.03314337879419327, "rewards/rejected": -0.7185500860214233, "step": 280 }, { "epoch": 0.37, "learning_rate": 4.916353265082686e-05, "logits/chosen": -1.5918539762496948, "logits/rejected": -1.5675849914550781, "logps/chosen": -229.21499633789062, "logps/rejected": -248.54562377929688, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": -0.4582862854003906, "rewards/margins": 0.10020212829113007, "rewards/rejected": -0.5584883689880371, "step": 281 }, { "epoch": 0.37, "learning_rate": 4.9154316993716565e-05, "logits/chosen": -1.9066779613494873, "logits/rejected": -1.9495766162872314, "logps/chosen": -149.90614318847656, "logps/rejected": -146.74632263183594, "loss": 0.7492, "rewards/accuracies": 0.5, "rewards/chosen": -0.44106772541999817, "rewards/margins": -0.07195230573415756, "rewards/rejected": -0.36911541223526, "step": 282 }, { "epoch": 0.37, "learning_rate": 4.9145051721567734e-05, "logits/chosen": -2.0382936000823975, "logits/rejected": -1.9569990634918213, "logps/chosen": -166.32168579101562, "logps/rejected": -162.79673767089844, "loss": 0.6561, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3699617087841034, "rewards/margins": 0.14387893676757812, "rewards/rejected": -0.5138406157493591, "step": 283 }, { "epoch": 0.37, "learning_rate": 4.913573685341205e-05, "logits/chosen": -1.474026083946228, "logits/rejected": -1.5045154094696045, "logps/chosen": -229.38619995117188, "logps/rejected": -236.83164978027344, "loss": 0.8663, "rewards/accuracies": 0.3125, "rewards/chosen": -0.9608985781669617, "rewards/margins": -0.17912134528160095, "rewards/rejected": -0.7817772626876831, "step": 284 }, { "epoch": 0.37, "learning_rate": 4.9126372408383025e-05, "logits/chosen": -1.8263182640075684, "logits/rejected": -1.8128119707107544, "logps/chosen": -190.57106018066406, "logps/rejected": -223.3215789794922, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": -0.5443102717399597, "rewards/margins": 0.06481970846652985, "rewards/rejected": -0.6091300249099731, "step": 285 }, { "epoch": 0.37, "learning_rate": 4.911695840571605e-05, "logits/chosen": -1.7644751071929932, "logits/rejected": -1.8120529651641846, "logps/chosen": -205.31759643554688, "logps/rejected": -199.3420867919922, "loss": 0.8917, "rewards/accuracies": 0.375, "rewards/chosen": -0.6110701560974121, "rewards/margins": -0.2775843143463135, "rewards/rejected": -0.33348581194877625, "step": 286 }, { "epoch": 0.38, "learning_rate": 4.910749486474828e-05, "logits/chosen": -1.6636385917663574, "logits/rejected": -1.5808690786361694, "logps/chosen": -176.58572387695312, "logps/rejected": -195.53761291503906, "loss": 0.7043, "rewards/accuracies": 0.625, "rewards/chosen": -0.6308318972587585, "rewards/margins": 0.1960841417312622, "rewards/rejected": -0.8269160985946655, "step": 287 }, { "epoch": 0.38, "learning_rate": 4.909798180491865e-05, "logits/chosen": -1.946243166923523, "logits/rejected": -2.0143542289733887, "logps/chosen": -178.8447265625, "logps/rejected": -188.88853454589844, "loss": 0.796, "rewards/accuracies": 0.5, "rewards/chosen": -0.4359901547431946, "rewards/margins": -0.07123897969722748, "rewards/rejected": -0.3647511601448059, "step": 288 }, { "epoch": 0.38, "learning_rate": 4.9088419245767803e-05, "logits/chosen": -2.0332159996032715, "logits/rejected": -1.9962480068206787, "logps/chosen": -181.74835205078125, "logps/rejected": -198.86822509765625, "loss": 0.8508, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9367510676383972, "rewards/margins": -0.1084718257188797, "rewards/rejected": -0.8282791972160339, "step": 289 }, { "epoch": 0.38, "learning_rate": 4.907880720693804e-05, "logits/chosen": -2.006517171859741, "logits/rejected": -1.8688626289367676, "logps/chosen": -176.2784423828125, "logps/rejected": -171.2430419921875, "loss": 0.8853, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8897907733917236, "rewards/margins": -0.2713664770126343, "rewards/rejected": -0.6184243559837341, "step": 290 }, { "epoch": 0.38, "learning_rate": 4.9069145708173324e-05, "logits/chosen": -1.914872169494629, "logits/rejected": -1.9809991121292114, "logps/chosen": -152.42398071289062, "logps/rejected": -172.4163360595703, "loss": 0.7878, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8594350814819336, "rewards/margins": -0.11893659830093384, "rewards/rejected": -0.740498423576355, "step": 291 }, { "epoch": 0.38, "learning_rate": 4.9059434769319205e-05, "logits/chosen": -1.4920971393585205, "logits/rejected": -1.479290246963501, "logps/chosen": -221.7364501953125, "logps/rejected": -217.07138061523438, "loss": 0.8203, "rewards/accuracies": 0.375, "rewards/chosen": -0.8598781824111938, "rewards/margins": -0.09480879455804825, "rewards/rejected": -0.7650693655014038, "step": 292 }, { "epoch": 0.38, "learning_rate": 4.904967441032278e-05, "logits/chosen": -1.6795321702957153, "logits/rejected": -1.7278181314468384, "logps/chosen": -176.05941772460938, "logps/rejected": -176.9445037841797, "loss": 0.6673, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6722793579101562, "rewards/margins": 0.1413556933403015, "rewards/rejected": -0.8136351108551025, "step": 293 }, { "epoch": 0.38, "learning_rate": 4.903986465123266e-05, "logits/chosen": -1.8752367496490479, "logits/rejected": -1.815263032913208, "logps/chosen": -163.13931274414062, "logps/rejected": -167.08154296875, "loss": 0.6859, "rewards/accuracies": 0.75, "rewards/chosen": -0.587963879108429, "rewards/margins": 0.14399470388889313, "rewards/rejected": -0.7319585084915161, "step": 294 }, { "epoch": 0.39, "learning_rate": 4.903000551219894e-05, "logits/chosen": -2.049884796142578, "logits/rejected": -2.096831798553467, "logps/chosen": -159.4933624267578, "logps/rejected": -172.0717010498047, "loss": 0.7986, "rewards/accuracies": 0.375, "rewards/chosen": -0.7208172082901001, "rewards/margins": -0.12478935718536377, "rewards/rejected": -0.5960277915000916, "step": 295 }, { "epoch": 0.39, "learning_rate": 4.902009701347313e-05, "logits/chosen": -1.733120322227478, "logits/rejected": -1.7238028049468994, "logps/chosen": -207.2619171142578, "logps/rejected": -196.6807098388672, "loss": 0.7436, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46741175651550293, "rewards/margins": 0.01438647136092186, "rewards/rejected": -0.4817982316017151, "step": 296 }, { "epoch": 0.39, "learning_rate": 4.901013917540814e-05, "logits/chosen": -2.027272939682007, "logits/rejected": -1.9993281364440918, "logps/chosen": -184.6562042236328, "logps/rejected": -183.0182647705078, "loss": 0.7772, "rewards/accuracies": 0.5, "rewards/chosen": -0.6997889280319214, "rewards/margins": 0.0013962779194116592, "rewards/rejected": -0.7011851668357849, "step": 297 }, { "epoch": 0.39, "learning_rate": 4.900013201845821e-05, "logits/chosen": -1.5796035528182983, "logits/rejected": -1.3949412107467651, "logps/chosen": -175.8470458984375, "logps/rejected": -212.80296325683594, "loss": 0.7523, "rewards/accuracies": 0.375, "rewards/chosen": -0.37455612421035767, "rewards/margins": -0.005637466907501221, "rewards/rejected": -0.36891865730285645, "step": 298 }, { "epoch": 0.39, "learning_rate": 4.899007556317893e-05, "logits/chosen": -1.9345701932907104, "logits/rejected": -2.001033306121826, "logps/chosen": -250.92816162109375, "logps/rejected": -241.9100341796875, "loss": 0.7686, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4188327193260193, "rewards/margins": -0.09854313731193542, "rewards/rejected": -0.32028958201408386, "step": 299 }, { "epoch": 0.39, "learning_rate": 4.8979969830227086e-05, "logits/chosen": -1.9917688369750977, "logits/rejected": -2.0171382427215576, "logps/chosen": -177.34434509277344, "logps/rejected": -170.58657836914062, "loss": 0.8629, "rewards/accuracies": 0.5, "rewards/chosen": -0.5179021954536438, "rewards/margins": -0.09615220129489899, "rewards/rejected": -0.4217500388622284, "step": 300 }, { "epoch": 0.39, "learning_rate": 4.896981484036074e-05, "logits/chosen": -2.010779619216919, "logits/rejected": -2.0212411880493164, "logps/chosen": -190.38487243652344, "logps/rejected": -189.54226684570312, "loss": 0.6184, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45494896173477173, "rewards/margins": 0.2873493432998657, "rewards/rejected": -0.7422983050346375, "step": 301 }, { "epoch": 0.4, "learning_rate": 4.895961061443911e-05, "logits/chosen": -1.8409286737442017, "logits/rejected": -1.7947205305099487, "logps/chosen": -189.4046173095703, "logps/rejected": -178.43617248535156, "loss": 0.8209, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7244110107421875, "rewards/margins": -0.0501336008310318, "rewards/rejected": -0.6742774248123169, "step": 302 }, { "epoch": 0.4, "learning_rate": 4.894935717342255e-05, "logits/chosen": -1.7337801456451416, "logits/rejected": -1.6998471021652222, "logps/chosen": -228.77264404296875, "logps/rejected": -214.67515563964844, "loss": 0.7953, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7624901533126831, "rewards/margins": -0.0503702238202095, "rewards/rejected": -0.7121198773384094, "step": 303 }, { "epoch": 0.4, "learning_rate": 4.8939054538372496e-05, "logits/chosen": -1.7520679235458374, "logits/rejected": -1.752877116203308, "logps/chosen": -186.9480743408203, "logps/rejected": -195.37429809570312, "loss": 0.8588, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6405542492866516, "rewards/margins": -0.206780344247818, "rewards/rejected": -0.4337739050388336, "step": 304 }, { "epoch": 0.4, "learning_rate": 4.8928702730451456e-05, "logits/chosen": -1.9656537771224976, "logits/rejected": -2.027155876159668, "logps/chosen": -199.00657653808594, "logps/rejected": -212.56494140625, "loss": 0.7799, "rewards/accuracies": 0.5, "rewards/chosen": -0.6149024963378906, "rewards/margins": -0.018680021166801453, "rewards/rejected": -0.5962225198745728, "step": 305 }, { "epoch": 0.4, "learning_rate": 4.891830177092294e-05, "logits/chosen": -1.629424810409546, "logits/rejected": -1.6878198385238647, "logps/chosen": -170.93138122558594, "logps/rejected": -171.99090576171875, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": -0.673701822757721, "rewards/margins": 0.13800469040870667, "rewards/rejected": -0.8117064237594604, "step": 306 }, { "epoch": 0.4, "learning_rate": 4.8907851681151396e-05, "logits/chosen": -1.7744640111923218, "logits/rejected": -1.855400800704956, "logps/chosen": -162.42169189453125, "logps/rejected": -166.64620971679688, "loss": 0.9846, "rewards/accuracies": 0.375, "rewards/chosen": -0.8018124103546143, "rewards/margins": -0.3769915699958801, "rewards/rejected": -0.42482078075408936, "step": 307 }, { "epoch": 0.4, "learning_rate": 4.889735248260221e-05, "logits/chosen": -1.889973521232605, "logits/rejected": -1.907044768333435, "logps/chosen": -166.85736083984375, "logps/rejected": -188.35101318359375, "loss": 0.6128, "rewards/accuracies": 0.625, "rewards/chosen": -0.3316620886325836, "rewards/margins": 0.2465287297964096, "rewards/rejected": -0.578190803527832, "step": 308 }, { "epoch": 0.4, "learning_rate": 4.8886804196841626e-05, "logits/chosen": -2.047497034072876, "logits/rejected": -2.0145740509033203, "logps/chosen": -201.215087890625, "logps/rejected": -203.3246307373047, "loss": 0.7635, "rewards/accuracies": 0.5, "rewards/chosen": -0.49714764952659607, "rewards/margins": 0.03132334351539612, "rewards/rejected": -0.5284709930419922, "step": 309 }, { "epoch": 0.41, "learning_rate": 4.887620684553674e-05, "logits/chosen": -1.831432580947876, "logits/rejected": -1.826611876487732, "logps/chosen": -170.79603576660156, "logps/rejected": -184.5007781982422, "loss": 0.734, "rewards/accuracies": 0.5, "rewards/chosen": -0.7777528166770935, "rewards/margins": 0.054428160190582275, "rewards/rejected": -0.832180917263031, "step": 310 }, { "epoch": 0.41, "learning_rate": 4.886556045045542e-05, "logits/chosen": -2.050309896469116, "logits/rejected": -2.0222089290618896, "logps/chosen": -186.81155395507812, "logps/rejected": -177.9705810546875, "loss": 0.7883, "rewards/accuracies": 0.5, "rewards/chosen": -0.6639207601547241, "rewards/margins": 0.03769933432340622, "rewards/rejected": -0.7016200423240662, "step": 311 }, { "epoch": 0.41, "learning_rate": 4.8854865033466275e-05, "logits/chosen": -2.076099395751953, "logits/rejected": -2.0824105739593506, "logps/chosen": -179.39295959472656, "logps/rejected": -174.53573608398438, "loss": 0.7306, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6415868401527405, "rewards/margins": 0.04155872389674187, "rewards/rejected": -0.6831455230712891, "step": 312 }, { "epoch": 0.41, "learning_rate": 4.88441206165386e-05, "logits/chosen": -1.6709840297698975, "logits/rejected": -1.8081105947494507, "logps/chosen": -166.87539672851562, "logps/rejected": -183.02621459960938, "loss": 0.7271, "rewards/accuracies": 0.375, "rewards/chosen": -0.518951952457428, "rewards/margins": 0.06794089823961258, "rewards/rejected": -0.58689284324646, "step": 313 }, { "epoch": 0.41, "learning_rate": 4.8833327221742356e-05, "logits/chosen": -1.9388179779052734, "logits/rejected": -1.9303326606750488, "logps/chosen": -229.31259155273438, "logps/rejected": -218.38491821289062, "loss": 0.9369, "rewards/accuracies": 0.375, "rewards/chosen": -0.7209200263023376, "rewards/margins": -0.36736997961997986, "rewards/rejected": -0.3535500466823578, "step": 314 }, { "epoch": 0.41, "learning_rate": 4.88224848712481e-05, "logits/chosen": -1.8626773357391357, "logits/rejected": -1.9242452383041382, "logps/chosen": -190.93325805664062, "logps/rejected": -220.92437744140625, "loss": 0.7958, "rewards/accuracies": 0.5, "rewards/chosen": -0.9179306030273438, "rewards/margins": -0.005557693541049957, "rewards/rejected": -0.9123728275299072, "step": 315 }, { "epoch": 0.41, "learning_rate": 4.881159358732694e-05, "logits/chosen": -1.9244499206542969, "logits/rejected": -1.9016033411026, "logps/chosen": -208.19638061523438, "logps/rejected": -194.61898803710938, "loss": 0.6003, "rewards/accuracies": 0.625, "rewards/chosen": -0.5419009923934937, "rewards/margins": 0.4854595363140106, "rewards/rejected": -1.0273605585098267, "step": 316 }, { "epoch": 0.41, "learning_rate": 4.8800653392350526e-05, "logits/chosen": -2.065157413482666, "logits/rejected": -1.979295015335083, "logps/chosen": -167.4358367919922, "logps/rejected": -148.04000854492188, "loss": 0.79, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6589381098747253, "rewards/margins": -0.009491220116615295, "rewards/rejected": -0.6494468450546265, "step": 317 }, { "epoch": 0.42, "learning_rate": 4.8789664308790936e-05, "logits/chosen": -1.8643383979797363, "logits/rejected": -1.801065444946289, "logps/chosen": -163.1219482421875, "logps/rejected": -162.10537719726562, "loss": 0.6148, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19017738103866577, "rewards/margins": 0.2843154966831207, "rewards/rejected": -0.4744928777217865, "step": 318 }, { "epoch": 0.42, "learning_rate": 4.8778626359220715e-05, "logits/chosen": -1.7437247037887573, "logits/rejected": -1.704676628112793, "logps/chosen": -152.5662078857422, "logps/rejected": -159.8699493408203, "loss": 0.7078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5272018909454346, "rewards/margins": 0.054309070110321045, "rewards/rejected": -0.5815109014511108, "step": 319 }, { "epoch": 0.42, "learning_rate": 4.8767539566312734e-05, "logits/chosen": -1.884958028793335, "logits/rejected": -1.849034070968628, "logps/chosen": -162.9790802001953, "logps/rejected": -154.30078125, "loss": 0.7916, "rewards/accuracies": 0.25, "rewards/chosen": -0.5811585187911987, "rewards/margins": -0.11695164442062378, "rewards/rejected": -0.46420690417289734, "step": 320 }, { "epoch": 0.42, "learning_rate": 4.875640395284023e-05, "logits/chosen": -1.9072563648223877, "logits/rejected": -1.8811615705490112, "logps/chosen": -173.24008178710938, "logps/rejected": -179.58273315429688, "loss": 0.7763, "rewards/accuracies": 0.375, "rewards/chosen": -0.484822154045105, "rewards/margins": -0.08394555747509003, "rewards/rejected": -0.40087658166885376, "step": 321 }, { "epoch": 0.42, "learning_rate": 4.874521954167671e-05, "logits/chosen": -1.8919446468353271, "logits/rejected": -1.9220166206359863, "logps/chosen": -158.00631713867188, "logps/rejected": -158.1962890625, "loss": 0.6527, "rewards/accuracies": 0.625, "rewards/chosen": -0.2469102144241333, "rewards/margins": 0.14598041772842407, "rewards/rejected": -0.39289066195487976, "step": 322 }, { "epoch": 0.42, "learning_rate": 4.8733986355795905e-05, "logits/chosen": -2.0576484203338623, "logits/rejected": -1.957137107849121, "logps/chosen": -183.29859924316406, "logps/rejected": -164.59544372558594, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -0.19953250885009766, "rewards/margins": 0.1992800384759903, "rewards/rejected": -0.39881253242492676, "step": 323 }, { "epoch": 0.42, "learning_rate": 4.8722704418271745e-05, "logits/chosen": -1.9749754667282104, "logits/rejected": -1.918540120124817, "logps/chosen": -167.0876922607422, "logps/rejected": -155.88064575195312, "loss": 0.8313, "rewards/accuracies": 0.1875, "rewards/chosen": -0.398946613073349, "rewards/margins": -0.19024060666561127, "rewards/rejected": -0.20870603621006012, "step": 324 }, { "epoch": 0.43, "learning_rate": 4.871137375227829e-05, "logits/chosen": -1.6178803443908691, "logits/rejected": -1.73251211643219, "logps/chosen": -267.42718505859375, "logps/rejected": -300.83477783203125, "loss": 0.8896, "rewards/accuracies": 0.25, "rewards/chosen": -0.6622705459594727, "rewards/margins": -0.21010492742061615, "rewards/rejected": -0.4521656334400177, "step": 325 }, { "epoch": 0.43, "learning_rate": 4.869999438108971e-05, "logits/chosen": -1.888526439666748, "logits/rejected": -1.8802413940429688, "logps/chosen": -190.6936798095703, "logps/rejected": -198.58245849609375, "loss": 0.8107, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6914330124855042, "rewards/margins": -0.09704277664422989, "rewards/rejected": -0.5943902134895325, "step": 326 }, { "epoch": 0.43, "learning_rate": 4.8688566328080215e-05, "logits/chosen": -1.5124105215072632, "logits/rejected": -1.5067615509033203, "logps/chosen": -282.9752502441406, "logps/rejected": -281.37933349609375, "loss": 0.7879, "rewards/accuracies": 0.375, "rewards/chosen": -0.5186710953712463, "rewards/margins": -0.08422104269266129, "rewards/rejected": -0.4344501197338104, "step": 327 }, { "epoch": 0.43, "learning_rate": 4.867708961672399e-05, "logits/chosen": -1.8901722431182861, "logits/rejected": -1.8916290998458862, "logps/chosen": -194.58010864257812, "logps/rejected": -206.20706176757812, "loss": 0.7058, "rewards/accuracies": 0.625, "rewards/chosen": -0.2762334942817688, "rewards/margins": 0.04776221513748169, "rewards/rejected": -0.3239956796169281, "step": 328 }, { "epoch": 0.43, "learning_rate": 4.866556427059519e-05, "logits/chosen": -1.9781274795532227, "logits/rejected": -1.991908073425293, "logps/chosen": -190.02462768554688, "logps/rejected": -172.36788940429688, "loss": 0.6999, "rewards/accuracies": 0.5, "rewards/chosen": -0.3898717761039734, "rewards/margins": 0.08414015173912048, "rewards/rejected": -0.47401195764541626, "step": 329 }, { "epoch": 0.43, "learning_rate": 4.865399031336787e-05, "logits/chosen": -1.7196893692016602, "logits/rejected": -1.7024250030517578, "logps/chosen": -183.65408325195312, "logps/rejected": -183.18309020996094, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": -0.18018794059753418, "rewards/margins": 0.1877795308828354, "rewards/rejected": -0.3679674565792084, "step": 330 }, { "epoch": 0.43, "learning_rate": 4.8642367768815936e-05, "logits/chosen": -1.82463800907135, "logits/rejected": -1.9342741966247559, "logps/chosen": -136.54745483398438, "logps/rejected": -149.12637329101562, "loss": 0.7994, "rewards/accuracies": 0.375, "rewards/chosen": -0.28317660093307495, "rewards/margins": -0.111660435795784, "rewards/rejected": -0.17151619493961334, "step": 331 }, { "epoch": 0.43, "learning_rate": 4.863069666081307e-05, "logits/chosen": -1.9666064977645874, "logits/rejected": -1.9600831270217896, "logps/chosen": -160.50009155273438, "logps/rejected": -172.79641723632812, "loss": 0.7938, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3549930453300476, "rewards/margins": -0.036753974854946136, "rewards/rejected": -0.3182390332221985, "step": 332 }, { "epoch": 0.44, "learning_rate": 4.861897701333274e-05, "logits/chosen": -1.7229609489440918, "logits/rejected": -1.7512197494506836, "logps/chosen": -160.8597869873047, "logps/rejected": -182.94717407226562, "loss": 0.7614, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5264466404914856, "rewards/margins": -0.05138474702835083, "rewards/rejected": -0.4750618636608124, "step": 333 }, { "epoch": 0.44, "learning_rate": 4.86072088504481e-05, "logits/chosen": -1.8726041316986084, "logits/rejected": -1.8346372842788696, "logps/chosen": -193.1851348876953, "logps/rejected": -175.16256713867188, "loss": 0.7167, "rewards/accuracies": 0.625, "rewards/chosen": 0.04130769148468971, "rewards/margins": 0.13345830142498016, "rewards/rejected": -0.09215061366558075, "step": 334 }, { "epoch": 0.44, "learning_rate": 4.859539219633199e-05, "logits/chosen": -1.6277759075164795, "logits/rejected": -1.650618553161621, "logps/chosen": -192.23538208007812, "logps/rejected": -195.0891876220703, "loss": 0.9001, "rewards/accuracies": 0.5, "rewards/chosen": -0.2764393389225006, "rewards/margins": -0.1653566211462021, "rewards/rejected": -0.11108270287513733, "step": 335 }, { "epoch": 0.44, "learning_rate": 4.8583527075256804e-05, "logits/chosen": -1.9752824306488037, "logits/rejected": -1.976406455039978, "logps/chosen": -215.763916015625, "logps/rejected": -222.56468200683594, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -0.3105733394622803, "rewards/margins": 0.3618737459182739, "rewards/rejected": -0.6724470853805542, "step": 336 }, { "epoch": 0.44, "learning_rate": 4.857161351159454e-05, "logits/chosen": -1.7939121723175049, "logits/rejected": -1.8076589107513428, "logps/chosen": -175.3466033935547, "logps/rejected": -176.5845947265625, "loss": 0.6957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10015758872032166, "rewards/margins": 0.10513995587825775, "rewards/rejected": -0.2052975744009018, "step": 337 }, { "epoch": 0.44, "learning_rate": 4.8559651529816664e-05, "logits/chosen": -1.7438242435455322, "logits/rejected": -1.7907154560089111, "logps/chosen": -190.70948791503906, "logps/rejected": -204.99143981933594, "loss": 0.6727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06472301483154297, "rewards/margins": 0.1332576423883438, "rewards/rejected": -0.19798064231872559, "step": 338 }, { "epoch": 0.44, "learning_rate": 4.854764115449411e-05, "logits/chosen": -1.7970941066741943, "logits/rejected": -1.8352383375167847, "logps/chosen": -167.88230895996094, "logps/rejected": -166.8907928466797, "loss": 0.6771, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2522534728050232, "rewards/margins": 0.06870199739933014, "rewards/rejected": -0.3209554851055145, "step": 339 }, { "epoch": 0.44, "learning_rate": 4.853558241029723e-05, "logits/chosen": -1.9054516553878784, "logits/rejected": -1.9092512130737305, "logps/chosen": -173.30734252929688, "logps/rejected": -189.3043975830078, "loss": 0.7071, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2111372947692871, "rewards/margins": 0.07802311331033707, "rewards/rejected": -0.2891604006290436, "step": 340 }, { "epoch": 0.45, "learning_rate": 4.8523475321995715e-05, "logits/chosen": -1.6928297281265259, "logits/rejected": -1.7166639566421509, "logps/chosen": -163.96253967285156, "logps/rejected": -157.01870727539062, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.023365147411823273, "rewards/margins": 0.09150275588035583, "rewards/rejected": -0.06813760101795197, "step": 341 }, { "epoch": 0.45, "learning_rate": 4.8511319914458555e-05, "logits/chosen": -1.5925623178482056, "logits/rejected": -1.6342376470565796, "logps/chosen": -169.8107452392578, "logps/rejected": -174.4763641357422, "loss": 0.7952, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6270018815994263, "rewards/margins": -0.09866765886545181, "rewards/rejected": -0.5283341407775879, "step": 342 }, { "epoch": 0.45, "learning_rate": 4.849911621265401e-05, "logits/chosen": -1.6875545978546143, "logits/rejected": -1.6878042221069336, "logps/chosen": -160.625732421875, "logps/rejected": -186.27923583984375, "loss": 0.6322, "rewards/accuracies": 0.625, "rewards/chosen": -0.33780720829963684, "rewards/margins": 0.20755568146705627, "rewards/rejected": -0.5453628301620483, "step": 343 }, { "epoch": 0.45, "learning_rate": 4.848686424164953e-05, "logits/chosen": -1.846010684967041, "logits/rejected": -1.860701084136963, "logps/chosen": -181.06878662109375, "logps/rejected": -178.49310302734375, "loss": 0.7739, "rewards/accuracies": 0.5, "rewards/chosen": -0.3998520076274872, "rewards/margins": -0.03460027277469635, "rewards/rejected": -0.36525171995162964, "step": 344 }, { "epoch": 0.45, "learning_rate": 4.84745640266117e-05, "logits/chosen": -1.8941820859909058, "logits/rejected": -1.8956482410430908, "logps/chosen": -228.6375732421875, "logps/rejected": -222.01824951171875, "loss": 0.7173, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3104371130466461, "rewards/margins": 0.0530361533164978, "rewards/rejected": -0.3634732663631439, "step": 345 }, { "epoch": 0.45, "learning_rate": 4.846221559280624e-05, "logits/chosen": -1.8459906578063965, "logits/rejected": -1.9024940729141235, "logps/chosen": -159.93936157226562, "logps/rejected": -165.43307495117188, "loss": 0.896, "rewards/accuracies": 0.5, "rewards/chosen": -0.23201912641525269, "rewards/margins": -0.12730032205581665, "rewards/rejected": -0.10471877455711365, "step": 346 }, { "epoch": 0.45, "learning_rate": 4.844981896559787e-05, "logits/chosen": -2.048933506011963, "logits/rejected": -2.061128616333008, "logps/chosen": -181.51718139648438, "logps/rejected": -185.25296020507812, "loss": 0.8408, "rewards/accuracies": 0.375, "rewards/chosen": -0.32012802362442017, "rewards/margins": -0.20446370542049408, "rewards/rejected": -0.1156642958521843, "step": 347 }, { "epoch": 0.46, "learning_rate": 4.8437374170450344e-05, "logits/chosen": -1.9143999814987183, "logits/rejected": -1.9218837022781372, "logps/chosen": -182.51480102539062, "logps/rejected": -182.2652130126953, "loss": 0.8078, "rewards/accuracies": 0.375, "rewards/chosen": -0.6757728457450867, "rewards/margins": -0.08970170468091965, "rewards/rejected": -0.5860711932182312, "step": 348 }, { "epoch": 0.46, "learning_rate": 4.842488123292632e-05, "logits/chosen": -1.736176609992981, "logits/rejected": -1.7229468822479248, "logps/chosen": -159.5717315673828, "logps/rejected": -189.70269775390625, "loss": 0.6977, "rewards/accuracies": 0.5, "rewards/chosen": -0.2442682683467865, "rewards/margins": 0.0333622470498085, "rewards/rejected": -0.2776305675506592, "step": 349 }, { "epoch": 0.46, "learning_rate": 4.8412340178687374e-05, "logits/chosen": -1.6423401832580566, "logits/rejected": -1.689012050628662, "logps/chosen": -151.13458251953125, "logps/rejected": -175.60679626464844, "loss": 0.8116, "rewards/accuracies": 0.375, "rewards/chosen": -0.33638978004455566, "rewards/margins": -0.15178707242012024, "rewards/rejected": -0.18460272252559662, "step": 350 }, { "epoch": 0.46, "learning_rate": 4.839975103349391e-05, "logits/chosen": -1.8096094131469727, "logits/rejected": -1.8605788946151733, "logps/chosen": -158.09568786621094, "logps/rejected": -172.2019805908203, "loss": 0.8539, "rewards/accuracies": 0.4375, "rewards/chosen": -0.17084676027297974, "rewards/margins": -0.16634216904640198, "rewards/rejected": -0.004504583775997162, "step": 351 }, { "epoch": 0.46, "learning_rate": 4.8387113823205096e-05, "logits/chosen": -1.8915197849273682, "logits/rejected": -1.9206541776657104, "logps/chosen": -173.2298583984375, "logps/rejected": -181.8549346923828, "loss": 0.6712, "rewards/accuracies": 0.75, "rewards/chosen": -0.22103241086006165, "rewards/margins": 0.12150835990905762, "rewards/rejected": -0.34254080057144165, "step": 352 }, { "epoch": 0.46, "learning_rate": 4.8374428573778864e-05, "logits/chosen": -1.9554250240325928, "logits/rejected": -2.0212881565093994, "logps/chosen": -188.2921142578125, "logps/rejected": -199.60987854003906, "loss": 0.7256, "rewards/accuracies": 0.625, "rewards/chosen": 0.0011289417743682861, "rewards/margins": 0.15477600693702698, "rewards/rejected": -0.1536470353603363, "step": 353 }, { "epoch": 0.46, "learning_rate": 4.8361695311271795e-05, "logits/chosen": -1.626081943511963, "logits/rejected": -1.5311224460601807, "logps/chosen": -185.79702758789062, "logps/rejected": -203.40957641601562, "loss": 0.8635, "rewards/accuracies": 0.4375, "rewards/chosen": -0.516851007938385, "rewards/margins": -0.19476839900016785, "rewards/rejected": -0.32208263874053955, "step": 354 }, { "epoch": 0.46, "learning_rate": 4.83489140618391e-05, "logits/chosen": -1.7895288467407227, "logits/rejected": -1.753225564956665, "logps/chosen": -217.21707153320312, "logps/rejected": -191.97915649414062, "loss": 0.7744, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4739922285079956, "rewards/margins": -0.06083906441926956, "rewards/rejected": -0.41315317153930664, "step": 355 }, { "epoch": 0.47, "learning_rate": 4.833608485173457e-05, "logits/chosen": -1.9408211708068848, "logits/rejected": -1.916908860206604, "logps/chosen": -210.69586181640625, "logps/rejected": -233.63662719726562, "loss": 0.7485, "rewards/accuracies": 0.625, "rewards/chosen": -0.3478991985321045, "rewards/margins": -0.0014675185084342957, "rewards/rejected": -0.3464316725730896, "step": 356 }, { "epoch": 0.47, "learning_rate": 4.8323207707310496e-05, "logits/chosen": -2.0299384593963623, "logits/rejected": -2.0607504844665527, "logps/chosen": -186.0093231201172, "logps/rejected": -179.7775115966797, "loss": 0.6698, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22601480782032013, "rewards/margins": 0.12358909845352173, "rewards/rejected": -0.34960389137268066, "step": 357 }, { "epoch": 0.47, "learning_rate": 4.831028265501764e-05, "logits/chosen": -1.7695857286453247, "logits/rejected": -1.7599815130233765, "logps/chosen": -161.50375366210938, "logps/rejected": -168.04541015625, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": -0.32897791266441345, "rewards/margins": 0.10846008360385895, "rewards/rejected": -0.437438040971756, "step": 358 }, { "epoch": 0.47, "learning_rate": 4.829730972140517e-05, "logits/chosen": -1.8524454832077026, "logits/rejected": -1.9345438480377197, "logps/chosen": -149.3410186767578, "logps/rejected": -160.43539428710938, "loss": 0.7965, "rewards/accuracies": 0.375, "rewards/chosen": -0.08119592815637589, "rewards/margins": -0.04433928430080414, "rewards/rejected": -0.03685663640499115, "step": 359 }, { "epoch": 0.47, "learning_rate": 4.8284288933120594e-05, "logits/chosen": -1.8411493301391602, "logits/rejected": -1.8524004220962524, "logps/chosen": -175.48532104492188, "logps/rejected": -182.57582092285156, "loss": 0.5989, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46560171246528625, "rewards/margins": 0.3886818289756775, "rewards/rejected": -0.8542835712432861, "step": 360 }, { "epoch": 0.47, "learning_rate": 4.8271220316909735e-05, "logits/chosen": -1.687551498413086, "logits/rejected": -1.722497582435608, "logps/chosen": -167.26939392089844, "logps/rejected": -181.43814086914062, "loss": 0.9314, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5299715399742126, "rewards/margins": -0.17927514016628265, "rewards/rejected": -0.35069650411605835, "step": 361 }, { "epoch": 0.47, "learning_rate": 4.825810389961666e-05, "logits/chosen": -1.916797399520874, "logits/rejected": -1.9123202562332153, "logps/chosen": -198.55821228027344, "logps/rejected": -225.2833709716797, "loss": 0.8431, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3632412254810333, "rewards/margins": -0.22463062405586243, "rewards/rejected": -0.1386105716228485, "step": 362 }, { "epoch": 0.48, "learning_rate": 4.8244939708183596e-05, "logits/chosen": -1.6189442873001099, "logits/rejected": -1.6897720098495483, "logps/chosen": -186.33004760742188, "logps/rejected": -181.720947265625, "loss": 0.8053, "rewards/accuracies": 0.375, "rewards/chosen": 0.27750128507614136, "rewards/margins": -0.1327960044145584, "rewards/rejected": 0.41029733419418335, "step": 363 }, { "epoch": 0.48, "learning_rate": 4.823172776965094e-05, "logits/chosen": -2.003798246383667, "logits/rejected": -1.9069428443908691, "logps/chosen": -206.22410583496094, "logps/rejected": -202.70343017578125, "loss": 0.7931, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07843560725450516, "rewards/margins": 0.052309781312942505, "rewards/rejected": -0.13074536621570587, "step": 364 }, { "epoch": 0.48, "learning_rate": 4.821846811115713e-05, "logits/chosen": -1.4173839092254639, "logits/rejected": -1.469193696975708, "logps/chosen": -252.58900451660156, "logps/rejected": -266.8291015625, "loss": 0.7776, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2866368591785431, "rewards/margins": -0.01211586594581604, "rewards/rejected": -0.27452099323272705, "step": 365 }, { "epoch": 0.48, "learning_rate": 4.820516075993865e-05, "logits/chosen": -1.9226216077804565, "logits/rejected": -1.8626333475112915, "logps/chosen": -189.990966796875, "logps/rejected": -212.71446228027344, "loss": 0.7835, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2961033880710602, "rewards/margins": -0.048593662679195404, "rewards/rejected": -0.24750974774360657, "step": 366 }, { "epoch": 0.48, "learning_rate": 4.819180574332994e-05, "logits/chosen": -2.1082520484924316, "logits/rejected": -2.0929677486419678, "logps/chosen": -172.19317626953125, "logps/rejected": -166.5411376953125, "loss": 0.7558, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3426204323768616, "rewards/margins": 0.029606737196445465, "rewards/rejected": -0.37222716212272644, "step": 367 }, { "epoch": 0.48, "learning_rate": 4.8178403088763355e-05, "logits/chosen": -1.8143612146377563, "logits/rejected": -1.8444868326187134, "logps/chosen": -219.51095581054688, "logps/rejected": -223.00332641601562, "loss": 0.7669, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5216841697692871, "rewards/margins": -0.06673192977905273, "rewards/rejected": -0.454952210187912, "step": 368 }, { "epoch": 0.48, "learning_rate": 4.8164952823769085e-05, "logits/chosen": -2.0486927032470703, "logits/rejected": -1.986092209815979, "logps/chosen": -179.5719757080078, "logps/rejected": -176.9426727294922, "loss": 0.7705, "rewards/accuracies": 0.5, "rewards/chosen": -0.12122049927711487, "rewards/margins": -0.017412271350622177, "rewards/rejected": -0.1038082093000412, "step": 369 }, { "epoch": 0.48, "learning_rate": 4.815145497597514e-05, "logits/chosen": -1.6093693971633911, "logits/rejected": -1.6691244840621948, "logps/chosen": -220.64230346679688, "logps/rejected": -226.87054443359375, "loss": 0.739, "rewards/accuracies": 0.625, "rewards/chosen": -0.48053669929504395, "rewards/margins": 0.01233639195561409, "rewards/rejected": -0.4928731620311737, "step": 370 }, { "epoch": 0.49, "learning_rate": 4.8137909573107246e-05, "logits/chosen": -1.5354715585708618, "logits/rejected": -1.4911106824874878, "logps/chosen": -173.0819549560547, "logps/rejected": -171.91053771972656, "loss": 0.7285, "rewards/accuracies": 0.375, "rewards/chosen": -0.09934857487678528, "rewards/margins": 0.08546656370162964, "rewards/rejected": -0.18481513857841492, "step": 371 }, { "epoch": 0.49, "learning_rate": 4.812431664298883e-05, "logits/chosen": -1.8645007610321045, "logits/rejected": -1.8654489517211914, "logps/chosen": -174.5074005126953, "logps/rejected": -174.8561248779297, "loss": 0.7322, "rewards/accuracies": 0.4375, "rewards/chosen": -0.34228798747062683, "rewards/margins": 0.07575173676013947, "rewards/rejected": -0.4180397093296051, "step": 372 }, { "epoch": 0.49, "learning_rate": 4.811067621354094e-05, "logits/chosen": -1.672195315361023, "logits/rejected": -1.7209053039550781, "logps/chosen": -179.9447784423828, "logps/rejected": -163.33111572265625, "loss": 0.9538, "rewards/accuracies": 0.25, "rewards/chosen": -0.3875757157802582, "rewards/margins": -0.4043459892272949, "rewards/rejected": 0.016770271584391594, "step": 373 }, { "epoch": 0.49, "learning_rate": 4.8096988312782174e-05, "logits/chosen": -2.031759023666382, "logits/rejected": -2.071578025817871, "logps/chosen": -179.43516540527344, "logps/rejected": -177.50064086914062, "loss": 0.8834, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4397561550140381, "rewards/margins": -0.1974533647298813, "rewards/rejected": -0.2423027753829956, "step": 374 }, { "epoch": 0.49, "learning_rate": 4.8083252968828665e-05, "logits/chosen": -1.9421418905258179, "logits/rejected": -1.922428846359253, "logps/chosen": -142.18792724609375, "logps/rejected": -137.76077270507812, "loss": 0.5961, "rewards/accuracies": 0.625, "rewards/chosen": -0.12917383015155792, "rewards/margins": 0.23805946111679077, "rewards/rejected": -0.3672332763671875, "step": 375 }, { "epoch": 0.49, "learning_rate": 4.8069470209893974e-05, "logits/chosen": -1.8579202890396118, "logits/rejected": -1.7998918294906616, "logps/chosen": -155.51712036132812, "logps/rejected": -156.0476531982422, "loss": 0.7183, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02486548200249672, "rewards/margins": 0.029943522065877914, "rewards/rejected": -0.054809004068374634, "step": 376 }, { "epoch": 0.49, "learning_rate": 4.8055640064289086e-05, "logits/chosen": -1.9409297704696655, "logits/rejected": -1.911987066268921, "logps/chosen": -245.75701904296875, "logps/rejected": -248.6239013671875, "loss": 0.8102, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4732462763786316, "rewards/margins": -0.1539333313703537, "rewards/rejected": -0.3193129599094391, "step": 377 }, { "epoch": 0.49, "learning_rate": 4.80417625604223e-05, "logits/chosen": -1.8907678127288818, "logits/rejected": -1.8695294857025146, "logps/chosen": -177.34124755859375, "logps/rejected": -179.99476623535156, "loss": 0.6441, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10845950245857239, "rewards/margins": 0.19328270852565765, "rewards/rejected": -0.3017422556877136, "step": 378 }, { "epoch": 0.5, "learning_rate": 4.8027837726799205e-05, "logits/chosen": -1.8234997987747192, "logits/rejected": -1.8477734327316284, "logps/chosen": -152.91793823242188, "logps/rejected": -166.2785186767578, "loss": 0.7019, "rewards/accuracies": 0.625, "rewards/chosen": -0.19078156352043152, "rewards/margins": 0.07663966715335846, "rewards/rejected": -0.26742124557495117, "step": 379 }, { "epoch": 0.5, "learning_rate": 4.801386559202259e-05, "logits/chosen": -1.9331165552139282, "logits/rejected": -1.9343795776367188, "logps/chosen": -197.23309326171875, "logps/rejected": -216.14816284179688, "loss": 0.6427, "rewards/accuracies": 0.5, "rewards/chosen": -0.1457238346338272, "rewards/margins": 0.22981032729148865, "rewards/rejected": -0.37553414702415466, "step": 380 }, { "epoch": 0.5, "learning_rate": 4.799984618479242e-05, "logits/chosen": -1.7535991668701172, "logits/rejected": -1.8327221870422363, "logps/chosen": -170.40121459960938, "logps/rejected": -194.4297332763672, "loss": 0.7553, "rewards/accuracies": 0.625, "rewards/chosen": -0.2612544000148773, "rewards/margins": 0.023236550390720367, "rewards/rejected": -0.2844909727573395, "step": 381 }, { "epoch": 0.5, "learning_rate": 4.798577953390577e-05, "logits/chosen": -1.9297330379486084, "logits/rejected": -1.8824340105056763, "logps/chosen": -192.4176788330078, "logps/rejected": -204.23397827148438, "loss": 0.7058, "rewards/accuracies": 0.4375, "rewards/chosen": -0.016188140958547592, "rewards/margins": 0.1552063524723053, "rewards/rejected": -0.1713944971561432, "step": 382 }, { "epoch": 0.5, "learning_rate": 4.797166566825675e-05, "logits/chosen": -1.983964204788208, "logits/rejected": -2.0162336826324463, "logps/chosen": -165.31370544433594, "logps/rejected": -175.11459350585938, "loss": 0.8134, "rewards/accuracies": 0.375, "rewards/chosen": -0.43316060304641724, "rewards/margins": -0.14317026734352112, "rewards/rejected": -0.2899903357028961, "step": 383 }, { "epoch": 0.5, "learning_rate": 4.795750461683644e-05, "logits/chosen": -1.7382255792617798, "logits/rejected": -1.7152175903320312, "logps/chosen": -162.63970947265625, "logps/rejected": -167.93568420410156, "loss": 0.8503, "rewards/accuracies": 0.5, "rewards/chosen": -0.3506108224391937, "rewards/margins": -0.19645802676677704, "rewards/rejected": -0.1541527807712555, "step": 384 }, { "epoch": 0.5, "learning_rate": 4.794329640873285e-05, "logits/chosen": -1.9835039377212524, "logits/rejected": -1.9371455907821655, "logps/chosen": -164.90518188476562, "logps/rejected": -153.89732360839844, "loss": 0.8365, "rewards/accuracies": 0.375, "rewards/chosen": -0.10440421104431152, "rewards/margins": -0.1423448920249939, "rewards/rejected": 0.03794068843126297, "step": 385 }, { "epoch": 0.51, "learning_rate": 4.7929041073130867e-05, "logits/chosen": -1.6812117099761963, "logits/rejected": -1.7789117097854614, "logps/chosen": -171.525390625, "logps/rejected": -194.28512573242188, "loss": 0.6979, "rewards/accuracies": 0.375, "rewards/chosen": 0.21003414690494537, "rewards/margins": 0.1351175457239151, "rewards/rejected": 0.07491665333509445, "step": 386 }, { "epoch": 0.51, "learning_rate": 4.7914738639312165e-05, "logits/chosen": -1.9188036918640137, "logits/rejected": -1.9109784364700317, "logps/chosen": -189.7833251953125, "logps/rejected": -164.029541015625, "loss": 0.8337, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3247818648815155, "rewards/margins": -0.18027785420417786, "rewards/rejected": -0.14450398087501526, "step": 387 }, { "epoch": 0.51, "learning_rate": 4.790038913665519e-05, "logits/chosen": -2.0011377334594727, "logits/rejected": -2.060800552368164, "logps/chosen": -176.84857177734375, "logps/rejected": -189.64443969726562, "loss": 0.7481, "rewards/accuracies": 0.3125, "rewards/chosen": -0.09154945611953735, "rewards/margins": -0.03873196616768837, "rewards/rejected": -0.052817486226558685, "step": 388 }, { "epoch": 0.51, "learning_rate": 4.788599259463502e-05, "logits/chosen": -1.8452333211898804, "logits/rejected": -1.8431695699691772, "logps/chosen": -154.12435913085938, "logps/rejected": -155.32220458984375, "loss": 0.7097, "rewards/accuracies": 0.625, "rewards/chosen": -0.14383897185325623, "rewards/margins": 0.18797016143798828, "rewards/rejected": -0.3318091332912445, "step": 389 }, { "epoch": 0.51, "learning_rate": 4.787154904282341e-05, "logits/chosen": -1.3743209838867188, "logits/rejected": -1.4174141883850098, "logps/chosen": -177.5303192138672, "logps/rejected": -211.40457153320312, "loss": 0.5768, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2827177047729492, "rewards/margins": 0.42551764845848083, "rewards/rejected": -0.1427999585866928, "step": 390 }, { "epoch": 0.51, "learning_rate": 4.7857058510888645e-05, "logits/chosen": -2.1841466426849365, "logits/rejected": -2.124525547027588, "logps/chosen": -246.4276885986328, "logps/rejected": -244.47906494140625, "loss": 0.6631, "rewards/accuracies": 0.375, "rewards/chosen": -0.038991779088974, "rewards/margins": 0.20248231291770935, "rewards/rejected": -0.24147410690784454, "step": 391 }, { "epoch": 0.51, "learning_rate": 4.7842521028595526e-05, "logits/chosen": -1.8262689113616943, "logits/rejected": -1.8402464389801025, "logps/chosen": -158.2376708984375, "logps/rejected": -177.70938110351562, "loss": 0.7674, "rewards/accuracies": 0.375, "rewards/chosen": -0.19041013717651367, "rewards/margins": 0.02704358845949173, "rewards/rejected": -0.2174537032842636, "step": 392 }, { "epoch": 0.51, "learning_rate": 4.7827936625805284e-05, "logits/chosen": -1.9320769309997559, "logits/rejected": -1.9378974437713623, "logps/chosen": -162.44107055664062, "logps/rejected": -162.588623046875, "loss": 0.702, "rewards/accuracies": 0.5625, "rewards/chosen": 0.22483624517917633, "rewards/margins": 0.04632706940174103, "rewards/rejected": 0.1785091906785965, "step": 393 }, { "epoch": 0.52, "learning_rate": 4.7813305332475535e-05, "logits/chosen": -2.023815870285034, "logits/rejected": -2.1101415157318115, "logps/chosen": -161.48867797851562, "logps/rejected": -177.5762939453125, "loss": 0.7641, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2686167359352112, "rewards/margins": -0.034188684076070786, "rewards/rejected": -0.2344280332326889, "step": 394 }, { "epoch": 0.52, "learning_rate": 4.77986271786602e-05, "logits/chosen": -1.9036113023757935, "logits/rejected": -1.8756999969482422, "logps/chosen": -200.5419464111328, "logps/rejected": -212.63906860351562, "loss": 0.6506, "rewards/accuracies": 0.5, "rewards/chosen": 0.06710982322692871, "rewards/margins": 0.18568173050880432, "rewards/rejected": -0.11857189238071442, "step": 395 }, { "epoch": 0.52, "learning_rate": 4.778390219450949e-05, "logits/chosen": -1.8086354732513428, "logits/rejected": -1.8471179008483887, "logps/chosen": -152.69277954101562, "logps/rejected": -143.47000122070312, "loss": 0.6566, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007534712553024292, "rewards/margins": 0.15669256448745728, "rewards/rejected": -0.14915785193443298, "step": 396 }, { "epoch": 0.52, "learning_rate": 4.776913041026976e-05, "logits/chosen": -2.1575872898101807, "logits/rejected": -2.189612627029419, "logps/chosen": -178.6571044921875, "logps/rejected": -187.46389770507812, "loss": 0.8387, "rewards/accuracies": 0.375, "rewards/chosen": -0.3838292360305786, "rewards/margins": -0.1892194300889969, "rewards/rejected": -0.19460979104042053, "step": 397 }, { "epoch": 0.52, "learning_rate": 4.775431185628353e-05, "logits/chosen": -2.0314245223999023, "logits/rejected": -2.0670695304870605, "logps/chosen": -139.46705627441406, "logps/rejected": -137.02342224121094, "loss": 0.799, "rewards/accuracies": 0.25, "rewards/chosen": -0.3406837582588196, "rewards/margins": -0.09202456474304199, "rewards/rejected": -0.24865922331809998, "step": 398 }, { "epoch": 0.52, "learning_rate": 4.7739446562989384e-05, "logits/chosen": -1.7543888092041016, "logits/rejected": -1.8098934888839722, "logps/chosen": -171.0781707763672, "logps/rejected": -192.0042724609375, "loss": 0.7873, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4363712966442108, "rewards/margins": 0.039258234202861786, "rewards/rejected": -0.4756295382976532, "step": 399 }, { "epoch": 0.52, "learning_rate": 4.772453456092191e-05, "logits/chosen": -1.8036949634552002, "logits/rejected": -1.8187798261642456, "logps/chosen": -176.441650390625, "logps/rejected": -187.6487274169922, "loss": 0.7976, "rewards/accuracies": 0.375, "rewards/chosen": -0.22391349077224731, "rewards/margins": -0.052816301584243774, "rewards/rejected": -0.17109718918800354, "step": 400 }, { "epoch": 0.52, "learning_rate": 4.7709575880711634e-05, "logits/chosen": -2.0721547603607178, "logits/rejected": -2.05245041847229, "logps/chosen": -181.74232482910156, "logps/rejected": -187.65249633789062, "loss": 0.5921, "rewards/accuracies": 0.5, "rewards/chosen": 0.2778853476047516, "rewards/margins": 0.33293941617012024, "rewards/rejected": -0.05505405738949776, "step": 401 }, { "epoch": 0.53, "learning_rate": 4.769457055308497e-05, "logits/chosen": -2.026765823364258, "logits/rejected": -2.018843412399292, "logps/chosen": -197.64260864257812, "logps/rejected": -179.19317626953125, "loss": 0.9551, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30001845955848694, "rewards/margins": -0.15433457493782043, "rewards/rejected": -0.1456838697195053, "step": 402 }, { "epoch": 0.53, "learning_rate": 4.767951860886415e-05, "logits/chosen": -1.7545514106750488, "logits/rejected": -1.7311463356018066, "logps/chosen": -189.0639190673828, "logps/rejected": -202.9300537109375, "loss": 0.7032, "rewards/accuracies": 0.5, "rewards/chosen": -0.16693828999996185, "rewards/margins": 0.15034297108650208, "rewards/rejected": -0.3172812759876251, "step": 403 }, { "epoch": 0.53, "learning_rate": 4.766442007896715e-05, "logits/chosen": -1.3602584600448608, "logits/rejected": -1.3138482570648193, "logps/chosen": -229.68812561035156, "logps/rejected": -228.81280517578125, "loss": 0.7063, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30803146958351135, "rewards/margins": 0.05981824919581413, "rewards/rejected": -0.3678497076034546, "step": 404 }, { "epoch": 0.53, "learning_rate": 4.764927499440767e-05, "logits/chosen": -1.2695108652114868, "logits/rejected": -1.2994788885116577, "logps/chosen": -178.60507202148438, "logps/rejected": -209.27049255371094, "loss": 0.9922, "rewards/accuracies": 0.25, "rewards/chosen": -0.590323805809021, "rewards/margins": -0.39020806550979614, "rewards/rejected": -0.20011577010154724, "step": 405 }, { "epoch": 0.53, "learning_rate": 4.763408338629498e-05, "logits/chosen": -2.1045475006103516, "logits/rejected": -2.1285929679870605, "logps/chosen": -230.02374267578125, "logps/rejected": -225.12106323242188, "loss": 0.8737, "rewards/accuracies": 0.4375, "rewards/chosen": -0.33914846181869507, "rewards/margins": -0.23797425627708435, "rewards/rejected": -0.10117418318986893, "step": 406 }, { "epoch": 0.53, "learning_rate": 4.761884528583396e-05, "logits/chosen": -1.4888020753860474, "logits/rejected": -1.4826213121414185, "logps/chosen": -221.777587890625, "logps/rejected": -241.30885314941406, "loss": 0.7111, "rewards/accuracies": 0.5, "rewards/chosen": -0.6717454791069031, "rewards/margins": 0.279751718044281, "rewards/rejected": -0.9514971375465393, "step": 407 }, { "epoch": 0.53, "learning_rate": 4.760356072432498e-05, "logits/chosen": -1.8832398653030396, "logits/rejected": -2.041220188140869, "logps/chosen": -290.4461364746094, "logps/rejected": -303.868408203125, "loss": 0.747, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1965964436531067, "rewards/margins": 0.01566828042268753, "rewards/rejected": 0.1809280961751938, "step": 408 }, { "epoch": 0.54, "learning_rate": 4.7588229733163834e-05, "logits/chosen": -1.9910494089126587, "logits/rejected": -2.065354585647583, "logps/chosen": -189.50331115722656, "logps/rejected": -204.8780975341797, "loss": 0.7243, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7861883640289307, "rewards/margins": 0.04858472943305969, "rewards/rejected": -0.834773063659668, "step": 409 }, { "epoch": 0.54, "learning_rate": 4.757285234384169e-05, "logits/chosen": -1.94736909866333, "logits/rejected": -2.0690090656280518, "logps/chosen": -183.47666931152344, "logps/rejected": -199.96681213378906, "loss": 0.7047, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5590149164199829, "rewards/margins": 0.15124721825122833, "rewards/rejected": -0.7102621793746948, "step": 410 }, { "epoch": 0.54, "learning_rate": 4.755742858794503e-05, "logits/chosen": -2.1221137046813965, "logits/rejected": -2.069035291671753, "logps/chosen": -203.06430053710938, "logps/rejected": -187.47093200683594, "loss": 0.7289, "rewards/accuracies": 0.5, "rewards/chosen": -0.4738054871559143, "rewards/margins": 0.03721272572875023, "rewards/rejected": -0.5110181570053101, "step": 411 }, { "epoch": 0.54, "learning_rate": 4.754195849715557e-05, "logits/chosen": -1.9132802486419678, "logits/rejected": -1.976714849472046, "logps/chosen": -162.88668823242188, "logps/rejected": -180.09291076660156, "loss": 0.658, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10689959675073624, "rewards/margins": 0.1677176058292389, "rewards/rejected": -0.2746172249317169, "step": 412 }, { "epoch": 0.54, "learning_rate": 4.75264421032502e-05, "logits/chosen": -1.8963744640350342, "logits/rejected": -1.8926461935043335, "logps/chosen": -199.32736206054688, "logps/rejected": -198.4470672607422, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": -0.6300160884857178, "rewards/margins": 0.09289233386516571, "rewards/rejected": -0.7229084968566895, "step": 413 }, { "epoch": 0.54, "learning_rate": 4.751087943810093e-05, "logits/chosen": -1.8022470474243164, "logits/rejected": -1.797208547592163, "logps/chosen": -184.97244262695312, "logps/rejected": -180.94895935058594, "loss": 0.5437, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35914158821105957, "rewards/margins": 0.4712386727333069, "rewards/rejected": -0.8303802609443665, "step": 414 }, { "epoch": 0.54, "learning_rate": 4.749527053367481e-05, "logits/chosen": -1.7871997356414795, "logits/rejected": -1.7747814655303955, "logps/chosen": -189.15310668945312, "logps/rejected": -178.39805603027344, "loss": 0.842, "rewards/accuracies": 0.375, "rewards/chosen": -0.7546498775482178, "rewards/margins": -0.17995727062225342, "rewards/rejected": -0.5746926069259644, "step": 415 }, { "epoch": 0.54, "learning_rate": 4.747961542203386e-05, "logits/chosen": -1.872157335281372, "logits/rejected": -1.9486424922943115, "logps/chosen": -172.46145629882812, "logps/rejected": -195.35421752929688, "loss": 0.7544, "rewards/accuracies": 0.5, "rewards/chosen": -0.42608222365379333, "rewards/margins": -0.038050394505262375, "rewards/rejected": -0.38803184032440186, "step": 416 }, { "epoch": 0.55, "learning_rate": 4.746391413533503e-05, "logits/chosen": -1.9934085607528687, "logits/rejected": -2.0255632400512695, "logps/chosen": -166.4789276123047, "logps/rejected": -183.97271728515625, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.414784699678421, "rewards/margins": 0.18602606654167175, "rewards/rejected": -0.600810706615448, "step": 417 }, { "epoch": 0.55, "learning_rate": 4.74481667058301e-05, "logits/chosen": -1.8942878246307373, "logits/rejected": -1.87151038646698, "logps/chosen": -173.86004638671875, "logps/rejected": -163.0937957763672, "loss": 0.8824, "rewards/accuracies": 0.375, "rewards/chosen": -0.772480309009552, "rewards/margins": -0.22370155155658722, "rewards/rejected": -0.5487788319587708, "step": 418 }, { "epoch": 0.55, "learning_rate": 4.743237316586564e-05, "logits/chosen": -1.9378021955490112, "logits/rejected": -1.9703481197357178, "logps/chosen": -180.11892700195312, "logps/rejected": -195.14578247070312, "loss": 0.7388, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3675965666770935, "rewards/margins": -0.02655930444598198, "rewards/rejected": -0.3410373032093048, "step": 419 }, { "epoch": 0.55, "learning_rate": 4.741653354788295e-05, "logits/chosen": -2.0154881477355957, "logits/rejected": -2.0120866298675537, "logps/chosen": -174.45498657226562, "logps/rejected": -182.1272430419922, "loss": 0.8029, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7631839513778687, "rewards/margins": -0.11559872329235077, "rewards/rejected": -0.6475852727890015, "step": 420 }, { "epoch": 0.55, "learning_rate": 4.7400647884417956e-05, "logits/chosen": -1.8835885524749756, "logits/rejected": -2.027597665786743, "logps/chosen": -169.2918701171875, "logps/rejected": -167.33514404296875, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": -0.33407047390937805, "rewards/margins": 0.15689513087272644, "rewards/rejected": -0.49096566438674927, "step": 421 }, { "epoch": 0.55, "learning_rate": 4.7384716208101166e-05, "logits/chosen": -2.028184413909912, "logits/rejected": -2.0173041820526123, "logps/chosen": -168.9990692138672, "logps/rejected": -160.54428100585938, "loss": 0.7799, "rewards/accuracies": 0.1875, "rewards/chosen": -0.5599774122238159, "rewards/margins": -0.1413966715335846, "rewards/rejected": -0.4185807704925537, "step": 422 }, { "epoch": 0.55, "learning_rate": 4.736873855165762e-05, "logits/chosen": -1.9846090078353882, "logits/rejected": -1.9857451915740967, "logps/chosen": -193.67715454101562, "logps/rejected": -175.9185333251953, "loss": 0.6656, "rewards/accuracies": 0.625, "rewards/chosen": -0.5016548037528992, "rewards/margins": 0.22657424211502075, "rewards/rejected": -0.7282290458679199, "step": 423 }, { "epoch": 0.55, "learning_rate": 4.735271494790678e-05, "logits/chosen": -1.92975914478302, "logits/rejected": -1.9406872987747192, "logps/chosen": -181.31509399414062, "logps/rejected": -167.101806640625, "loss": 0.7386, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18490439653396606, "rewards/margins": 0.0683104544878006, "rewards/rejected": -0.25321486592292786, "step": 424 }, { "epoch": 0.56, "learning_rate": 4.733664542976253e-05, "logits/chosen": -1.9516609907150269, "logits/rejected": -2.005047559738159, "logps/chosen": -167.55972290039062, "logps/rejected": -168.99807739257812, "loss": 0.9659, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3683350086212158, "rewards/margins": -0.24237681925296783, "rewards/rejected": -0.12595820426940918, "step": 425 }, { "epoch": 0.56, "learning_rate": 4.732053003023301e-05, "logits/chosen": -1.934274435043335, "logits/rejected": -1.9798094034194946, "logps/chosen": -153.78518676757812, "logps/rejected": -166.25979614257812, "loss": 0.7058, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5433434247970581, "rewards/margins": 0.13497428596019745, "rewards/rejected": -0.6783177256584167, "step": 426 }, { "epoch": 0.56, "learning_rate": 4.730436878242064e-05, "logits/chosen": -1.9575129747390747, "logits/rejected": -1.9926663637161255, "logps/chosen": -153.06951904296875, "logps/rejected": -174.07875061035156, "loss": 0.8359, "rewards/accuracies": 0.5, "rewards/chosen": -0.4944266080856323, "rewards/margins": -0.00036665797233581543, "rewards/rejected": -0.4940599203109741, "step": 427 }, { "epoch": 0.56, "learning_rate": 4.7288161719522016e-05, "logits/chosen": -1.9566092491149902, "logits/rejected": -1.9228875637054443, "logps/chosen": -162.54771423339844, "logps/rejected": -167.0357666015625, "loss": 0.8701, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5988922715187073, "rewards/margins": -0.1607256382703781, "rewards/rejected": -0.43816661834716797, "step": 428 }, { "epoch": 0.56, "learning_rate": 4.727190887482783e-05, "logits/chosen": -2.212228775024414, "logits/rejected": -2.238290309906006, "logps/chosen": -185.6492462158203, "logps/rejected": -201.63949584960938, "loss": 0.7027, "rewards/accuracies": 0.4375, "rewards/chosen": -0.45072147250175476, "rewards/margins": 0.03795819729566574, "rewards/rejected": -0.4886796772480011, "step": 429 }, { "epoch": 0.56, "learning_rate": 4.725561028172282e-05, "logits/chosen": -2.08243989944458, "logits/rejected": -2.100586414337158, "logps/chosen": -169.82723999023438, "logps/rejected": -169.41476440429688, "loss": 0.8011, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5032753348350525, "rewards/margins": -0.07662791758775711, "rewards/rejected": -0.4266473352909088, "step": 430 }, { "epoch": 0.56, "learning_rate": 4.7239265973685696e-05, "logits/chosen": -1.7974039316177368, "logits/rejected": -1.802499771118164, "logps/chosen": -166.3297119140625, "logps/rejected": -184.35450744628906, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": -0.3483116924762726, "rewards/margins": 0.2958502173423767, "rewards/rejected": -0.6441619396209717, "step": 431 }, { "epoch": 0.57, "learning_rate": 4.722287598428907e-05, "logits/chosen": -1.9482653141021729, "logits/rejected": -1.9997018575668335, "logps/chosen": -202.8570098876953, "logps/rejected": -219.10565185546875, "loss": 0.6303, "rewards/accuracies": 0.625, "rewards/chosen": -0.0477115735411644, "rewards/margins": 0.21878241002559662, "rewards/rejected": -0.26649394631385803, "step": 432 }, { "epoch": 0.57, "learning_rate": 4.720644034719938e-05, "logits/chosen": -1.8863980770111084, "logits/rejected": -1.8580697774887085, "logps/chosen": -178.37112426757812, "logps/rejected": -196.09762573242188, "loss": 0.7045, "rewards/accuracies": 0.5625, "rewards/chosen": -0.359438419342041, "rewards/margins": 0.0665070116519928, "rewards/rejected": -0.4259454607963562, "step": 433 }, { "epoch": 0.57, "learning_rate": 4.7189959096176825e-05, "logits/chosen": -1.962789535522461, "logits/rejected": -2.0059375762939453, "logps/chosen": -168.6863250732422, "logps/rejected": -207.6638946533203, "loss": 0.7517, "rewards/accuracies": 0.5, "rewards/chosen": -0.33223748207092285, "rewards/margins": 0.007875222712755203, "rewards/rejected": -0.3401126563549042, "step": 434 }, { "epoch": 0.57, "learning_rate": 4.7173432265075334e-05, "logits/chosen": -2.13173770904541, "logits/rejected": -2.1997811794281006, "logps/chosen": -171.5570526123047, "logps/rejected": -173.00726318359375, "loss": 0.7831, "rewards/accuracies": 0.25, "rewards/chosen": -0.5620677471160889, "rewards/margins": -0.08540257066488266, "rewards/rejected": -0.47666510939598083, "step": 435 }, { "epoch": 0.57, "learning_rate": 4.7156859887842416e-05, "logits/chosen": -1.9717164039611816, "logits/rejected": -1.9659010171890259, "logps/chosen": -163.4027099609375, "logps/rejected": -171.51205444335938, "loss": 0.8767, "rewards/accuracies": 0.4375, "rewards/chosen": -0.36586710810661316, "rewards/margins": -0.22375579178333282, "rewards/rejected": -0.14211128652095795, "step": 436 }, { "epoch": 0.57, "learning_rate": 4.714024199851915e-05, "logits/chosen": -1.9461572170257568, "logits/rejected": -1.9711329936981201, "logps/chosen": -173.81141662597656, "logps/rejected": -172.41860961914062, "loss": 0.7655, "rewards/accuracies": 0.5, "rewards/chosen": -0.5734329223632812, "rewards/margins": 0.0679081380367279, "rewards/rejected": -0.6413410305976868, "step": 437 }, { "epoch": 0.57, "learning_rate": 4.712357863124013e-05, "logits/chosen": -2.0299744606018066, "logits/rejected": -2.055668830871582, "logps/chosen": -168.48684692382812, "logps/rejected": -182.92257690429688, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": -0.44026386737823486, "rewards/margins": 0.11328400671482086, "rewards/rejected": -0.5535478591918945, "step": 438 }, { "epoch": 0.57, "learning_rate": 4.710686982023332e-05, "logits/chosen": -2.0356907844543457, "logits/rejected": -1.9616978168487549, "logps/chosen": -147.8501739501953, "logps/rejected": -154.8659210205078, "loss": 0.7309, "rewards/accuracies": 0.5, "rewards/chosen": -0.06124575436115265, "rewards/margins": 0.03877441585063934, "rewards/rejected": -0.10002017021179199, "step": 439 }, { "epoch": 0.58, "learning_rate": 4.709011559982006e-05, "logits/chosen": -2.043642997741699, "logits/rejected": -1.9762914180755615, "logps/chosen": -193.67340087890625, "logps/rejected": -186.66543579101562, "loss": 0.7127, "rewards/accuracies": 0.625, "rewards/chosen": -0.34980466961860657, "rewards/margins": 0.07452677190303802, "rewards/rejected": -0.4243314862251282, "step": 440 }, { "epoch": 0.58, "learning_rate": 4.707331600441495e-05, "logits/chosen": -2.072479724884033, "logits/rejected": -2.091381549835205, "logps/chosen": -191.45059204101562, "logps/rejected": -176.9473876953125, "loss": 0.6801, "rewards/accuracies": 0.625, "rewards/chosen": -0.029101327061653137, "rewards/margins": 0.26256757974624634, "rewards/rejected": -0.2916688919067383, "step": 441 }, { "epoch": 0.58, "learning_rate": 4.705647106852581e-05, "logits/chosen": -1.9268254041671753, "logits/rejected": -1.8882079124450684, "logps/chosen": -171.5392303466797, "logps/rejected": -182.5844268798828, "loss": 0.7921, "rewards/accuracies": 0.375, "rewards/chosen": -0.2353130429983139, "rewards/margins": -0.0611705407500267, "rewards/rejected": -0.1741425096988678, "step": 442 }, { "epoch": 0.58, "learning_rate": 4.7039580826753564e-05, "logits/chosen": -2.029810667037964, "logits/rejected": -2.026019334793091, "logps/chosen": -169.64918518066406, "logps/rejected": -186.5985107421875, "loss": 0.6615, "rewards/accuracies": 0.5, "rewards/chosen": -0.34466996788978577, "rewards/margins": 0.14720463752746582, "rewards/rejected": -0.491874635219574, "step": 443 }, { "epoch": 0.58, "learning_rate": 4.7022645313792235e-05, "logits/chosen": -1.5730178356170654, "logits/rejected": -1.5958049297332764, "logps/chosen": -160.8270263671875, "logps/rejected": -176.29554748535156, "loss": 0.7613, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34809601306915283, "rewards/margins": 0.23688597977161407, "rewards/rejected": -0.5849819779396057, "step": 444 }, { "epoch": 0.58, "learning_rate": 4.700566456442882e-05, "logits/chosen": -2.009403705596924, "logits/rejected": -1.9752486944198608, "logps/chosen": -178.82701110839844, "logps/rejected": -180.32125854492188, "loss": 1.0244, "rewards/accuracies": 0.375, "rewards/chosen": -0.6049767136573792, "rewards/margins": -0.4276657700538635, "rewards/rejected": -0.1773110032081604, "step": 445 }, { "epoch": 0.58, "learning_rate": 4.6988638613543216e-05, "logits/chosen": -1.7354819774627686, "logits/rejected": -1.731933355331421, "logps/chosen": -167.52633666992188, "logps/rejected": -181.66583251953125, "loss": 0.789, "rewards/accuracies": 0.5, "rewards/chosen": -0.6287019848823547, "rewards/margins": -0.054200708866119385, "rewards/rejected": -0.5745012760162354, "step": 446 }, { "epoch": 0.58, "learning_rate": 4.6971567496108206e-05, "logits/chosen": -1.9981721639633179, "logits/rejected": -2.026167154312134, "logps/chosen": -212.54891967773438, "logps/rejected": -215.69253540039062, "loss": 0.7395, "rewards/accuracies": 0.625, "rewards/chosen": -0.4362500011920929, "rewards/margins": -0.020926453173160553, "rewards/rejected": -0.4153235852718353, "step": 447 }, { "epoch": 0.59, "learning_rate": 4.695445124718931e-05, "logits/chosen": -2.239379405975342, "logits/rejected": -2.186093807220459, "logps/chosen": -189.42547607421875, "logps/rejected": -181.0236358642578, "loss": 0.6656, "rewards/accuracies": 0.5, "rewards/chosen": -0.3167141079902649, "rewards/margins": 0.10905791819095612, "rewards/rejected": -0.4257720112800598, "step": 448 }, { "epoch": 0.59, "learning_rate": 4.693728990194479e-05, "logits/chosen": -2.132059097290039, "logits/rejected": -2.1224942207336426, "logps/chosen": -211.412841796875, "logps/rejected": -197.23439025878906, "loss": 0.6771, "rewards/accuracies": 0.625, "rewards/chosen": -0.29473942518234253, "rewards/margins": 0.20832209289073944, "rewards/rejected": -0.5030615329742432, "step": 449 }, { "epoch": 0.59, "learning_rate": 4.692008349562551e-05, "logits/chosen": -2.2113986015319824, "logits/rejected": -2.1679513454437256, "logps/chosen": -174.953369140625, "logps/rejected": -188.64576721191406, "loss": 0.9728, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5910927057266235, "rewards/margins": -0.3634761869907379, "rewards/rejected": -0.22761650383472443, "step": 450 }, { "epoch": 0.59, "learning_rate": 4.690283206357491e-05, "logits/chosen": -1.984092116355896, "logits/rejected": -2.0135576725006104, "logps/chosen": -187.25738525390625, "logps/rejected": -189.52325439453125, "loss": 0.672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4424164593219757, "rewards/margins": 0.20770896971225739, "rewards/rejected": -0.6501253843307495, "step": 451 }, { "epoch": 0.59, "learning_rate": 4.6885535641228904e-05, "logits/chosen": -2.0326945781707764, "logits/rejected": -2.0409088134765625, "logps/chosen": -189.7724609375, "logps/rejected": -196.32164001464844, "loss": 0.846, "rewards/accuracies": 0.375, "rewards/chosen": -0.2984338402748108, "rewards/margins": -0.14787545800209045, "rewards/rejected": -0.15055838227272034, "step": 452 }, { "epoch": 0.59, "learning_rate": 4.6868194264115833e-05, "logits/chosen": -1.8356232643127441, "logits/rejected": -1.860498309135437, "logps/chosen": -190.26791381835938, "logps/rejected": -192.42124938964844, "loss": 0.6865, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5506845712661743, "rewards/margins": 0.12794733047485352, "rewards/rejected": -0.6786318421363831, "step": 453 }, { "epoch": 0.59, "learning_rate": 4.685080796785637e-05, "logits/chosen": -2.0949289798736572, "logits/rejected": -2.105635643005371, "logps/chosen": -188.15576171875, "logps/rejected": -177.27642822265625, "loss": 0.7395, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6156455874443054, "rewards/margins": 0.00026201456785202026, "rewards/rejected": -0.615907609462738, "step": 454 }, { "epoch": 0.6, "learning_rate": 4.683337678816345e-05, "logits/chosen": -2.062208414077759, "logits/rejected": -1.9715068340301514, "logps/chosen": -247.74729919433594, "logps/rejected": -225.91094970703125, "loss": 0.8454, "rewards/accuracies": 0.25, "rewards/chosen": -0.7724697589874268, "rewards/margins": -0.1988295614719391, "rewards/rejected": -0.5736401081085205, "step": 455 }, { "epoch": 0.6, "learning_rate": 4.6815900760842236e-05, "logits/chosen": -1.999656081199646, "logits/rejected": -2.013665199279785, "logps/chosen": -189.76425170898438, "logps/rejected": -200.77801513671875, "loss": 0.7929, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1113797426223755, "rewards/margins": -0.07439464330673218, "rewards/rejected": -1.0369850397109985, "step": 456 }, { "epoch": 0.6, "learning_rate": 4.679837992178996e-05, "logits/chosen": -1.962624192237854, "logits/rejected": -1.8894569873809814, "logps/chosen": -163.32264709472656, "logps/rejected": -176.6225128173828, "loss": 0.6683, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5831518769264221, "rewards/margins": 0.18081454932689667, "rewards/rejected": -0.76396644115448, "step": 457 }, { "epoch": 0.6, "learning_rate": 4.678081430699594e-05, "logits/chosen": -1.9277567863464355, "logits/rejected": -1.9991313219070435, "logps/chosen": -177.146484375, "logps/rejected": -180.6768798828125, "loss": 0.549, "rewards/accuracies": 0.75, "rewards/chosen": -0.5843685865402222, "rewards/margins": 0.5196143388748169, "rewards/rejected": -1.103982925415039, "step": 458 }, { "epoch": 0.6, "learning_rate": 4.676320395254146e-05, "logits/chosen": -1.8018032312393188, "logits/rejected": -1.7631927728652954, "logps/chosen": -193.01077270507812, "logps/rejected": -193.25103759765625, "loss": 0.6489, "rewards/accuracies": 0.625, "rewards/chosen": -0.5062118172645569, "rewards/margins": 0.23078583180904388, "rewards/rejected": -0.736997663974762, "step": 459 }, { "epoch": 0.6, "learning_rate": 4.674554889459968e-05, "logits/chosen": -1.7966090440750122, "logits/rejected": -1.7850843667984009, "logps/chosen": -182.4764404296875, "logps/rejected": -184.0175018310547, "loss": 0.7334, "rewards/accuracies": 0.5, "rewards/chosen": -0.4785413146018982, "rewards/margins": -0.006460566073656082, "rewards/rejected": -0.4720807671546936, "step": 460 }, { "epoch": 0.6, "learning_rate": 4.672784916943562e-05, "logits/chosen": -1.6125917434692383, "logits/rejected": -1.6394853591918945, "logps/chosen": -182.64862060546875, "logps/rejected": -196.8616943359375, "loss": 0.5382, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8303923010826111, "rewards/margins": 0.4725598096847534, "rewards/rejected": -1.3029520511627197, "step": 461 }, { "epoch": 0.6, "learning_rate": 4.6710104813406034e-05, "logits/chosen": -1.7587897777557373, "logits/rejected": -1.7165967226028442, "logps/chosen": -178.50250244140625, "logps/rejected": -159.33375549316406, "loss": 0.9045, "rewards/accuracies": 0.25, "rewards/chosen": -0.5282204151153564, "rewards/margins": -0.22660110890865326, "rewards/rejected": -0.30161935091018677, "step": 462 }, { "epoch": 0.61, "learning_rate": 4.669231586295934e-05, "logits/chosen": -1.8907124996185303, "logits/rejected": -1.9228183031082153, "logps/chosen": -169.16119384765625, "logps/rejected": -180.715087890625, "loss": 0.7763, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8981459736824036, "rewards/margins": -0.11685739457607269, "rewards/rejected": -0.7812885046005249, "step": 463 }, { "epoch": 0.61, "learning_rate": 4.667448235463557e-05, "logits/chosen": -1.2660267353057861, "logits/rejected": -1.2475149631500244, "logps/chosen": -183.68353271484375, "logps/rejected": -182.84422302246094, "loss": 0.8929, "rewards/accuracies": 0.375, "rewards/chosen": -0.7595192790031433, "rewards/margins": -0.14992079138755798, "rewards/rejected": -0.6095985174179077, "step": 464 }, { "epoch": 0.61, "learning_rate": 4.665660432506629e-05, "logits/chosen": -1.8095303773880005, "logits/rejected": -1.8506840467453003, "logps/chosen": -213.27145385742188, "logps/rejected": -220.65640258789062, "loss": 0.7946, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6156535744667053, "rewards/margins": -0.057905957102775574, "rewards/rejected": -0.5577476024627686, "step": 465 }, { "epoch": 0.61, "learning_rate": 4.6638681810974496e-05, "logits/chosen": -1.758918285369873, "logits/rejected": -1.7417278289794922, "logps/chosen": -180.2194366455078, "logps/rejected": -199.62014770507812, "loss": 0.5957, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17319843173027039, "rewards/margins": 0.4147520363330841, "rewards/rejected": -0.5879504680633545, "step": 466 }, { "epoch": 0.61, "learning_rate": 4.6620714849174576e-05, "logits/chosen": -1.5012279748916626, "logits/rejected": -1.495218276977539, "logps/chosen": -227.26577758789062, "logps/rejected": -223.47470092773438, "loss": 0.7009, "rewards/accuracies": 0.5, "rewards/chosen": -0.7020196318626404, "rewards/margins": 0.10670151561498642, "rewards/rejected": -0.808721125125885, "step": 467 }, { "epoch": 0.61, "learning_rate": 4.660270347657219e-05, "logits/chosen": -1.4245662689208984, "logits/rejected": -1.4722357988357544, "logps/chosen": -219.63504028320312, "logps/rejected": -246.0736846923828, "loss": 0.6091, "rewards/accuracies": 0.5, "rewards/chosen": -0.7042097449302673, "rewards/margins": 0.5171206593513489, "rewards/rejected": -1.2213302850723267, "step": 468 }, { "epoch": 0.61, "learning_rate": 4.658464773016428e-05, "logits/chosen": -1.7068259716033936, "logits/rejected": -1.6351027488708496, "logps/chosen": -199.54136657714844, "logps/rejected": -181.50997924804688, "loss": 0.8131, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2041518688201904, "rewards/margins": -0.023167330771684647, "rewards/rejected": -1.180984616279602, "step": 469 }, { "epoch": 0.62, "learning_rate": 4.6566547647038864e-05, "logits/chosen": -1.7098909616470337, "logits/rejected": -1.80801522731781, "logps/chosen": -167.95101928710938, "logps/rejected": -180.0511016845703, "loss": 0.5463, "rewards/accuracies": 0.75, "rewards/chosen": -0.2591591477394104, "rewards/margins": 0.41101884841918945, "rewards/rejected": -0.6701779961585999, "step": 470 }, { "epoch": 0.62, "learning_rate": 4.6548403264375074e-05, "logits/chosen": -2.014415979385376, "logits/rejected": -2.0018627643585205, "logps/chosen": -181.22947692871094, "logps/rejected": -189.5872344970703, "loss": 0.8377, "rewards/accuracies": 0.5, "rewards/chosen": -0.9818540811538696, "rewards/margins": -0.06248188391327858, "rewards/rejected": -0.9193722009658813, "step": 471 }, { "epoch": 0.62, "learning_rate": 4.6530214619443037e-05, "logits/chosen": -1.902940034866333, "logits/rejected": -1.940006971359253, "logps/chosen": -156.01939392089844, "logps/rejected": -156.20623779296875, "loss": 0.8502, "rewards/accuracies": 0.375, "rewards/chosen": -0.6196620464324951, "rewards/margins": -0.18280625343322754, "rewards/rejected": -0.4368557929992676, "step": 472 }, { "epoch": 0.62, "learning_rate": 4.6511981749603775e-05, "logits/chosen": -1.8872562646865845, "logits/rejected": -1.9487504959106445, "logps/chosen": -178.33872985839844, "logps/rejected": -182.2080535888672, "loss": 0.7332, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6403207778930664, "rewards/margins": 0.12355762720108032, "rewards/rejected": -0.7638784050941467, "step": 473 }, { "epoch": 0.62, "learning_rate": 4.6493704692309175e-05, "logits/chosen": -1.8873028755187988, "logits/rejected": -1.8430054187774658, "logps/chosen": -248.9535675048828, "logps/rejected": -238.04327392578125, "loss": 1.062, "rewards/accuracies": 0.25, "rewards/chosen": -1.2121449708938599, "rewards/margins": -0.3689318299293518, "rewards/rejected": -0.8432131409645081, "step": 474 }, { "epoch": 0.62, "learning_rate": 4.647538348510189e-05, "logits/chosen": -1.8361527919769287, "logits/rejected": -1.856339454650879, "logps/chosen": -171.12091064453125, "logps/rejected": -179.4962158203125, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": -0.7720082998275757, "rewards/margins": 0.10657864063978195, "rewards/rejected": -0.8785868883132935, "step": 475 }, { "epoch": 0.62, "learning_rate": 4.645701816561523e-05, "logits/chosen": -1.6982722282409668, "logits/rejected": -1.7370768785476685, "logps/chosen": -232.54293823242188, "logps/rejected": -213.66964721679688, "loss": 0.7178, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8362730741500854, "rewards/margins": 0.07781472057104111, "rewards/rejected": -0.9140878319740295, "step": 476 }, { "epoch": 0.62, "learning_rate": 4.643860877157314e-05, "logits/chosen": -1.7802523374557495, "logits/rejected": -1.7304799556732178, "logps/chosen": -168.3419189453125, "logps/rejected": -205.67333984375, "loss": 0.8153, "rewards/accuracies": 0.375, "rewards/chosen": -0.4274996221065521, "rewards/margins": -0.046173423528671265, "rewards/rejected": -0.38132619857788086, "step": 477 }, { "epoch": 0.63, "learning_rate": 4.642015534079012e-05, "logits/chosen": -1.9037768840789795, "logits/rejected": -1.8988232612609863, "logps/chosen": -173.9936981201172, "logps/rejected": -197.27523803710938, "loss": 0.6135, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32596248388290405, "rewards/margins": 0.23917633295059204, "rewards/rejected": -0.5651388168334961, "step": 478 }, { "epoch": 0.63, "learning_rate": 4.640165791117106e-05, "logits/chosen": -1.9618606567382812, "logits/rejected": -1.9455369710922241, "logps/chosen": -190.39830017089844, "logps/rejected": -175.3238067626953, "loss": 0.9091, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0257893800735474, "rewards/margins": -0.2109348475933075, "rewards/rejected": -0.8148545622825623, "step": 479 }, { "epoch": 0.63, "learning_rate": 4.63831165207113e-05, "logits/chosen": -1.864621877670288, "logits/rejected": -1.889084815979004, "logps/chosen": -205.9281005859375, "logps/rejected": -230.0751953125, "loss": 0.7761, "rewards/accuracies": 0.5, "rewards/chosen": -0.6850899457931519, "rewards/margins": -0.05720193684101105, "rewards/rejected": -0.6278879046440125, "step": 480 }, { "epoch": 0.63, "learning_rate": 4.6364531207496426e-05, "logits/chosen": -1.737329363822937, "logits/rejected": -1.745915412902832, "logps/chosen": -171.1361846923828, "logps/rejected": -175.88906860351562, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": -0.743391215801239, "rewards/margins": 0.0780097097158432, "rewards/rejected": -0.8214008808135986, "step": 481 }, { "epoch": 0.63, "learning_rate": 4.634590200970227e-05, "logits/chosen": -1.8280831575393677, "logits/rejected": -1.8323631286621094, "logps/chosen": -188.1634521484375, "logps/rejected": -213.10360717773438, "loss": 0.7336, "rewards/accuracies": 0.625, "rewards/chosen": -0.7827669382095337, "rewards/margins": 0.3351660966873169, "rewards/rejected": -1.1179330348968506, "step": 482 }, { "epoch": 0.63, "learning_rate": 4.632722896559481e-05, "logits/chosen": -1.9295848608016968, "logits/rejected": -1.9276199340820312, "logps/chosen": -167.31385803222656, "logps/rejected": -186.1995391845703, "loss": 0.6013, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16915923357009888, "rewards/margins": 0.3335033357143402, "rewards/rejected": -0.5026625394821167, "step": 483 }, { "epoch": 0.63, "learning_rate": 4.630851211353007e-05, "logits/chosen": -1.6712524890899658, "logits/rejected": -1.788968801498413, "logps/chosen": -164.82725524902344, "logps/rejected": -182.7891387939453, "loss": 0.7642, "rewards/accuracies": 0.5, "rewards/chosen": -0.609403669834137, "rewards/margins": 0.03477644547820091, "rewards/rejected": -0.644180178642273, "step": 484 }, { "epoch": 0.63, "learning_rate": 4.628975149195407e-05, "logits/chosen": -1.2296477556228638, "logits/rejected": -1.2632193565368652, "logps/chosen": -204.8614044189453, "logps/rejected": -224.30543518066406, "loss": 0.7234, "rewards/accuracies": 0.5, "rewards/chosen": -0.7619196176528931, "rewards/margins": 0.12807466089725494, "rewards/rejected": -0.8899943232536316, "step": 485 }, { "epoch": 0.64, "learning_rate": 4.6270947139402744e-05, "logits/chosen": -2.047361373901367, "logits/rejected": -2.1051080226898193, "logps/chosen": -169.24703979492188, "logps/rejected": -184.35586547851562, "loss": 0.66, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6050511002540588, "rewards/margins": 0.13156384229660034, "rewards/rejected": -0.7366149425506592, "step": 486 }, { "epoch": 0.64, "learning_rate": 4.6252099094501834e-05, "logits/chosen": -1.9163178205490112, "logits/rejected": -1.8832037448883057, "logps/chosen": -202.50064086914062, "logps/rejected": -206.1787109375, "loss": 0.8078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8862631320953369, "rewards/margins": -0.06633087992668152, "rewards/rejected": -0.819932222366333, "step": 487 }, { "epoch": 0.64, "learning_rate": 4.623320739596685e-05, "logits/chosen": -1.943336009979248, "logits/rejected": -1.9594800472259521, "logps/chosen": -184.20272827148438, "logps/rejected": -185.5780029296875, "loss": 0.948, "rewards/accuracies": 0.1875, "rewards/chosen": -0.8648966550827026, "rewards/margins": -0.32853201031684875, "rewards/rejected": -0.5363646745681763, "step": 488 }, { "epoch": 0.64, "learning_rate": 4.621427208260296e-05, "logits/chosen": -2.0543949604034424, "logits/rejected": -2.09141206741333, "logps/chosen": -186.11021423339844, "logps/rejected": -197.07164001464844, "loss": 0.6593, "rewards/accuracies": 0.625, "rewards/chosen": -0.6588226556777954, "rewards/margins": 0.23558923602104187, "rewards/rejected": -0.8944119811058044, "step": 489 }, { "epoch": 0.64, "learning_rate": 4.6195293193304915e-05, "logits/chosen": -2.2013731002807617, "logits/rejected": -2.209264039993286, "logps/chosen": -192.5195770263672, "logps/rejected": -188.9171600341797, "loss": 0.8303, "rewards/accuracies": 0.375, "rewards/chosen": -0.8534584641456604, "rewards/margins": -0.10596348345279694, "rewards/rejected": -0.7474948763847351, "step": 490 }, { "epoch": 0.64, "learning_rate": 4.6176270767056976e-05, "logits/chosen": -1.8635625839233398, "logits/rejected": -1.8899545669555664, "logps/chosen": -193.61715698242188, "logps/rejected": -196.3071746826172, "loss": 0.5859, "rewards/accuracies": 0.625, "rewards/chosen": -0.620331346988678, "rewards/margins": 0.35552215576171875, "rewards/rejected": -0.9758535027503967, "step": 491 }, { "epoch": 0.64, "learning_rate": 4.615720484293286e-05, "logits/chosen": -2.0970966815948486, "logits/rejected": -2.0922045707702637, "logps/chosen": -171.4237060546875, "logps/rejected": -173.91969299316406, "loss": 0.7777, "rewards/accuracies": 0.5, "rewards/chosen": -0.7394740581512451, "rewards/margins": 0.09215141832828522, "rewards/rejected": -0.8316254615783691, "step": 492 }, { "epoch": 0.65, "learning_rate": 4.613809546009558e-05, "logits/chosen": -1.923639178276062, "logits/rejected": -1.9087320566177368, "logps/chosen": -210.64447021484375, "logps/rejected": -202.98309326171875, "loss": 0.7005, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7617734670639038, "rewards/margins": 0.28139957785606384, "rewards/rejected": -1.04317307472229, "step": 493 }, { "epoch": 0.65, "learning_rate": 4.611894265779748e-05, "logits/chosen": -1.7692898511886597, "logits/rejected": -1.8441616296768188, "logps/chosen": -181.05316162109375, "logps/rejected": -190.46311950683594, "loss": 0.8492, "rewards/accuracies": 0.3125, "rewards/chosen": -0.9923998117446899, "rewards/margins": -0.16852112114429474, "rewards/rejected": -0.8238787651062012, "step": 494 }, { "epoch": 0.65, "learning_rate": 4.609974647538003e-05, "logits/chosen": -2.242365837097168, "logits/rejected": -2.2200753688812256, "logps/chosen": -192.88491821289062, "logps/rejected": -209.79190063476562, "loss": 0.7824, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7258377075195312, "rewards/margins": 0.06173846498131752, "rewards/rejected": -0.7875760793685913, "step": 495 }, { "epoch": 0.65, "learning_rate": 4.608050695227385e-05, "logits/chosen": -2.0390326976776123, "logits/rejected": -2.0602505207061768, "logps/chosen": -159.614013671875, "logps/rejected": -159.90530395507812, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -0.6496396660804749, "rewards/margins": 0.15987172722816467, "rewards/rejected": -0.8095113635063171, "step": 496 }, { "epoch": 0.65, "learning_rate": 4.606122412799857e-05, "logits/chosen": -1.8621050119400024, "logits/rejected": -1.843872308731079, "logps/chosen": -191.3387451171875, "logps/rejected": -212.04867553710938, "loss": 0.838, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9131224751472473, "rewards/margins": -0.06589814275503159, "rewards/rejected": -0.847224235534668, "step": 497 }, { "epoch": 0.65, "learning_rate": 4.6041898042162764e-05, "logits/chosen": -1.9165095090866089, "logits/rejected": -1.9768743515014648, "logps/chosen": -179.1850128173828, "logps/rejected": -197.76953125, "loss": 0.7127, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8291003704071045, "rewards/margins": 0.08120033144950867, "rewards/rejected": -0.9103007316589355, "step": 498 }, { "epoch": 0.65, "learning_rate": 4.602252873446386e-05, "logits/chosen": -1.71052086353302, "logits/rejected": -1.7267752885818481, "logps/chosen": -233.17083740234375, "logps/rejected": -238.11651611328125, "loss": 0.7183, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6033438444137573, "rewards/margins": 0.14365322887897491, "rewards/rejected": -0.7469971179962158, "step": 499 }, { "epoch": 0.65, "learning_rate": 4.60031162446881e-05, "logits/chosen": -1.685623049736023, "logits/rejected": -1.759178876876831, "logps/chosen": -179.87600708007812, "logps/rejected": -183.2005615234375, "loss": 0.7049, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7123013734817505, "rewards/margins": 0.11958488076925278, "rewards/rejected": -0.8318862915039062, "step": 500 }, { "epoch": 0.66, "learning_rate": 4.5983660612710365e-05, "logits/chosen": -1.9058446884155273, "logits/rejected": -1.9013440608978271, "logps/chosen": -177.750244140625, "logps/rejected": -163.71591186523438, "loss": 0.7503, "rewards/accuracies": 0.5, "rewards/chosen": -0.6146600246429443, "rewards/margins": -0.01889324188232422, "rewards/rejected": -0.5957667827606201, "step": 501 }, { "epoch": 0.66, "learning_rate": 4.596416187849423e-05, "logits/chosen": -1.6376805305480957, "logits/rejected": -1.5382472276687622, "logps/chosen": -177.68373107910156, "logps/rejected": -205.24169921875, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": -0.39451998472213745, "rewards/margins": 0.3564395606517792, "rewards/rejected": -0.7509595155715942, "step": 502 }, { "epoch": 0.66, "learning_rate": 4.5944620082091745e-05, "logits/chosen": -2.126429319381714, "logits/rejected": -2.135838747024536, "logps/chosen": -168.3842010498047, "logps/rejected": -193.17636108398438, "loss": 0.8234, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9303070902824402, "rewards/margins": -0.04648715257644653, "rewards/rejected": -0.8838199973106384, "step": 503 }, { "epoch": 0.66, "learning_rate": 4.5925035263643444e-05, "logits/chosen": -2.267376661300659, "logits/rejected": -2.2018914222717285, "logps/chosen": -199.2592315673828, "logps/rejected": -171.32907104492188, "loss": 1.1319, "rewards/accuracies": 0.375, "rewards/chosen": -1.2791969776153564, "rewards/margins": -0.523222804069519, "rewards/rejected": -0.7559741735458374, "step": 504 }, { "epoch": 0.66, "learning_rate": 4.5905407463378225e-05, "logits/chosen": -2.0708484649658203, "logits/rejected": -2.0920658111572266, "logps/chosen": -160.09619140625, "logps/rejected": -166.4447021484375, "loss": 0.7491, "rewards/accuracies": 0.5, "rewards/chosen": -0.7854949831962585, "rewards/margins": 0.020966414362192154, "rewards/rejected": -0.8064614534378052, "step": 505 }, { "epoch": 0.66, "learning_rate": 4.588573672161326e-05, "logits/chosen": -2.0177011489868164, "logits/rejected": -2.026048183441162, "logps/chosen": -272.7811584472656, "logps/rejected": -275.3113708496094, "loss": 0.755, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7942684888839722, "rewards/margins": 0.19223184883594513, "rewards/rejected": -0.9865003228187561, "step": 506 }, { "epoch": 0.66, "learning_rate": 4.586602307875396e-05, "logits/chosen": -2.0870862007141113, "logits/rejected": -2.1000866889953613, "logps/chosen": -159.46205139160156, "logps/rejected": -163.21469116210938, "loss": 0.5693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6831841468811035, "rewards/margins": 0.39189672470092773, "rewards/rejected": -1.0750807523727417, "step": 507 }, { "epoch": 0.66, "learning_rate": 4.5846266575293816e-05, "logits/chosen": -2.2326223850250244, "logits/rejected": -2.258408308029175, "logps/chosen": -194.6920928955078, "logps/rejected": -210.00762939453125, "loss": 0.6405, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3496246635913849, "rewards/margins": 0.22416295111179352, "rewards/rejected": -0.5737876296043396, "step": 508 }, { "epoch": 0.67, "learning_rate": 4.582646725181441e-05, "logits/chosen": -1.880753993988037, "logits/rejected": -1.873094081878662, "logps/chosen": -185.65306091308594, "logps/rejected": -175.70822143554688, "loss": 0.8836, "rewards/accuracies": 0.375, "rewards/chosen": -0.8803216218948364, "rewards/margins": -0.1467835009098053, "rewards/rejected": -0.7335382103919983, "step": 509 }, { "epoch": 0.67, "learning_rate": 4.580662514898522e-05, "logits/chosen": -2.0115244388580322, "logits/rejected": -2.0410544872283936, "logps/chosen": -144.72772216796875, "logps/rejected": -144.91920471191406, "loss": 0.8422, "rewards/accuracies": 0.375, "rewards/chosen": -0.628396213054657, "rewards/margins": -0.12304525077342987, "rewards/rejected": -0.5053509473800659, "step": 510 }, { "epoch": 0.67, "learning_rate": 4.5786740307563636e-05, "logits/chosen": -2.0044987201690674, "logits/rejected": -2.0016119480133057, "logps/chosen": -173.86984252929688, "logps/rejected": -171.8080596923828, "loss": 0.8741, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8329235315322876, "rewards/margins": -0.2628735303878784, "rewards/rejected": -0.5700500011444092, "step": 511 }, { "epoch": 0.67, "learning_rate": 4.576681276839483e-05, "logits/chosen": -1.8384199142456055, "logits/rejected": -1.9472904205322266, "logps/chosen": -151.073486328125, "logps/rejected": -166.85824584960938, "loss": 0.7757, "rewards/accuracies": 0.5625, "rewards/chosen": -1.168123483657837, "rewards/margins": 0.10600915551185608, "rewards/rejected": -1.2741327285766602, "step": 512 }, { "epoch": 0.67, "learning_rate": 4.574684257241168e-05, "logits/chosen": -1.633155107498169, "logits/rejected": -1.6421935558319092, "logps/chosen": -178.23052978515625, "logps/rejected": -178.91262817382812, "loss": 0.6821, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3441859483718872, "rewards/margins": 0.10676999390125275, "rewards/rejected": -0.45095589756965637, "step": 513 }, { "epoch": 0.67, "learning_rate": 4.572682976063468e-05, "logits/chosen": -2.086414337158203, "logits/rejected": -2.0576364994049072, "logps/chosen": -210.91128540039062, "logps/rejected": -207.3369140625, "loss": 0.7466, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8871356248855591, "rewards/margins": 0.06643228977918625, "rewards/rejected": -0.9535678625106812, "step": 514 }, { "epoch": 0.67, "learning_rate": 4.5706774374171854e-05, "logits/chosen": -1.8427444696426392, "logits/rejected": -1.861626148223877, "logps/chosen": -179.20623779296875, "logps/rejected": -185.3961181640625, "loss": 0.7066, "rewards/accuracies": 0.625, "rewards/chosen": -0.45703721046447754, "rewards/margins": 0.051899224519729614, "rewards/rejected": -0.5089364647865295, "step": 515 }, { "epoch": 0.68, "learning_rate": 4.56866764542187e-05, "logits/chosen": -1.7605255842208862, "logits/rejected": -1.7601615190505981, "logps/chosen": -196.36715698242188, "logps/rejected": -227.3414306640625, "loss": 0.6068, "rewards/accuracies": 0.75, "rewards/chosen": -0.5065858364105225, "rewards/margins": 0.42499038577079773, "rewards/rejected": -0.9315762519836426, "step": 516 }, { "epoch": 0.68, "learning_rate": 4.566653604205805e-05, "logits/chosen": -1.8466157913208008, "logits/rejected": -1.7656481266021729, "logps/chosen": -192.74110412597656, "logps/rejected": -192.05816650390625, "loss": 0.9557, "rewards/accuracies": 0.1875, "rewards/chosen": -1.4459354877471924, "rewards/margins": -0.40716552734375, "rewards/rejected": -1.0387699604034424, "step": 517 }, { "epoch": 0.68, "learning_rate": 4.5646353179060057e-05, "logits/chosen": -1.8340647220611572, "logits/rejected": -1.8470059633255005, "logps/chosen": -210.009765625, "logps/rejected": -211.32252502441406, "loss": 0.8106, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6514133810997009, "rewards/margins": -0.08276738226413727, "rewards/rejected": -0.5686460137367249, "step": 518 }, { "epoch": 0.68, "learning_rate": 4.562612790668204e-05, "logits/chosen": -1.9675233364105225, "logits/rejected": -1.9512717723846436, "logps/chosen": -145.25350952148438, "logps/rejected": -151.3224334716797, "loss": 0.7672, "rewards/accuracies": 0.5, "rewards/chosen": -0.6850120425224304, "rewards/margins": 0.0520671084523201, "rewards/rejected": -0.7370792031288147, "step": 519 }, { "epoch": 0.68, "learning_rate": 4.560586026646845e-05, "logits/chosen": -1.7509602308273315, "logits/rejected": -1.701064944267273, "logps/chosen": -228.2190704345703, "logps/rejected": -214.31961059570312, "loss": 0.9088, "rewards/accuracies": 0.4375, "rewards/chosen": -0.739928662776947, "rewards/margins": -0.19188320636749268, "rewards/rejected": -0.5480455160140991, "step": 520 }, { "epoch": 0.68, "learning_rate": 4.558555030005075e-05, "logits/chosen": -2.1057076454162598, "logits/rejected": -2.0997025966644287, "logps/chosen": -219.45846557617188, "logps/rejected": -222.98912048339844, "loss": 0.7314, "rewards/accuracies": 0.5, "rewards/chosen": -0.62837815284729, "rewards/margins": 0.05226774513721466, "rewards/rejected": -0.6806458234786987, "step": 521 }, { "epoch": 0.68, "learning_rate": 4.556519804914736e-05, "logits/chosen": -2.0136232376098633, "logits/rejected": -1.996194839477539, "logps/chosen": -184.241455078125, "logps/rejected": -173.1997528076172, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": -0.2964561879634857, "rewards/margins": 0.219641774892807, "rewards/rejected": -0.5160979628562927, "step": 522 }, { "epoch": 0.68, "learning_rate": 4.554480355556354e-05, "logits/chosen": -1.9112343788146973, "logits/rejected": -1.8661640882492065, "logps/chosen": -168.06959533691406, "logps/rejected": -173.24607849121094, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": -0.5839947462081909, "rewards/margins": 0.007685039192438126, "rewards/rejected": -0.5916797518730164, "step": 523 }, { "epoch": 0.69, "learning_rate": 4.552436686119134e-05, "logits/chosen": -1.8316876888275146, "logits/rejected": -1.8144513368606567, "logps/chosen": -181.4498291015625, "logps/rejected": -186.10784912109375, "loss": 0.9599, "rewards/accuracies": 0.375, "rewards/chosen": -0.6419578194618225, "rewards/margins": -0.39174380898475647, "rewards/rejected": -0.25021398067474365, "step": 524 }, { "epoch": 0.69, "learning_rate": 4.550388800800948e-05, "logits/chosen": -1.8764700889587402, "logits/rejected": -1.9319936037063599, "logps/chosen": -168.9038543701172, "logps/rejected": -168.67950439453125, "loss": 0.6579, "rewards/accuracies": 0.6875, "rewards/chosen": -0.301873117685318, "rewards/margins": 0.17850691080093384, "rewards/rejected": -0.48037999868392944, "step": 525 }, { "epoch": 0.69, "learning_rate": 4.548336703808328e-05, "logits/chosen": -1.9540125131607056, "logits/rejected": -1.9322861433029175, "logps/chosen": -228.04966735839844, "logps/rejected": -231.02769470214844, "loss": 0.8968, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6345371007919312, "rewards/margins": -0.17063456773757935, "rewards/rejected": -0.4639025628566742, "step": 526 }, { "epoch": 0.69, "learning_rate": 4.546280399356457e-05, "logits/chosen": -1.6315593719482422, "logits/rejected": -1.6160613298416138, "logps/chosen": -225.32296752929688, "logps/rejected": -212.6197052001953, "loss": 0.627, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24752049148082733, "rewards/margins": 0.3519718647003174, "rewards/rejected": -0.5994923710823059, "step": 527 }, { "epoch": 0.69, "learning_rate": 4.54421989166916e-05, "logits/chosen": -2.0561516284942627, "logits/rejected": -2.1020302772521973, "logps/chosen": -168.7066192626953, "logps/rejected": -180.7903289794922, "loss": 0.782, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4703682065010071, "rewards/margins": -0.10123680531978607, "rewards/rejected": -0.3691314160823822, "step": 528 }, { "epoch": 0.69, "learning_rate": 4.542155184978898e-05, "logits/chosen": -1.8236006498336792, "logits/rejected": -1.810032606124878, "logps/chosen": -169.7281951904297, "logps/rejected": -162.98208618164062, "loss": 0.9228, "rewards/accuracies": 0.3125, "rewards/chosen": -0.45871978998184204, "rewards/margins": -0.2870974838733673, "rewards/rejected": -0.17162233591079712, "step": 529 }, { "epoch": 0.69, "learning_rate": 4.540086283526754e-05, "logits/chosen": -2.0382392406463623, "logits/rejected": -2.0122694969177246, "logps/chosen": -196.42291259765625, "logps/rejected": -196.21530151367188, "loss": 0.9005, "rewards/accuracies": 0.3125, "rewards/chosen": -0.638641357421875, "rewards/margins": -0.2959403991699219, "rewards/rejected": -0.34270092844963074, "step": 530 }, { "epoch": 0.69, "learning_rate": 4.538013191562431e-05, "logits/chosen": -1.4818511009216309, "logits/rejected": -1.5436879396438599, "logps/chosen": -173.07022094726562, "logps/rejected": -172.4759063720703, "loss": 0.7594, "rewards/accuracies": 0.375, "rewards/chosen": -0.5663694143295288, "rewards/margins": -0.0718955397605896, "rewards/rejected": -0.4944738447666168, "step": 531 }, { "epoch": 0.7, "learning_rate": 4.5359359133442356e-05, "logits/chosen": -1.788183331489563, "logits/rejected": -1.7689244747161865, "logps/chosen": -194.0176239013672, "logps/rejected": -183.86965942382812, "loss": 0.5613, "rewards/accuracies": 0.625, "rewards/chosen": -0.342574805021286, "rewards/margins": 0.44486334919929504, "rewards/rejected": -0.787438154220581, "step": 532 }, { "epoch": 0.7, "learning_rate": 4.533854453139077e-05, "logits/chosen": -1.6594241857528687, "logits/rejected": -1.6917797327041626, "logps/chosen": -235.48097229003906, "logps/rejected": -265.2259826660156, "loss": 0.8443, "rewards/accuracies": 0.375, "rewards/chosen": -0.6860448718070984, "rewards/margins": -0.2058294713497162, "rewards/rejected": -0.4802154004573822, "step": 533 }, { "epoch": 0.7, "learning_rate": 4.5317688152224515e-05, "logits/chosen": -2.104198932647705, "logits/rejected": -2.0917530059814453, "logps/chosen": -193.2283935546875, "logps/rejected": -199.61131286621094, "loss": 0.9177, "rewards/accuracies": 0.5625, "rewards/chosen": -0.36964061856269836, "rewards/margins": -0.1281491219997406, "rewards/rejected": -0.24149154126644135, "step": 534 }, { "epoch": 0.7, "learning_rate": 4.52967900387844e-05, "logits/chosen": -2.016317367553711, "logits/rejected": -2.0518314838409424, "logps/chosen": -192.35496520996094, "logps/rejected": -203.24896240234375, "loss": 0.9074, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2986190915107727, "rewards/margins": -0.2612786591053009, "rewards/rejected": -0.037340469658374786, "step": 535 }, { "epoch": 0.7, "learning_rate": 4.5275850233996925e-05, "logits/chosen": -1.9520158767700195, "logits/rejected": -1.9376802444458008, "logps/chosen": -189.20904541015625, "logps/rejected": -222.3477783203125, "loss": 0.6845, "rewards/accuracies": 0.4375, "rewards/chosen": -0.38919875025749207, "rewards/margins": 0.12857961654663086, "rewards/rejected": -0.5177783370018005, "step": 536 }, { "epoch": 0.7, "learning_rate": 4.525486878087426e-05, "logits/chosen": -1.7735748291015625, "logits/rejected": -1.7847058773040771, "logps/chosen": -177.4947967529297, "logps/rejected": -180.04049682617188, "loss": 0.6215, "rewards/accuracies": 0.625, "rewards/chosen": -0.2464422881603241, "rewards/margins": 0.21997497975826263, "rewards/rejected": -0.4664173126220703, "step": 537 }, { "epoch": 0.7, "learning_rate": 4.523384572251409e-05, "logits/chosen": -1.6225758790969849, "logits/rejected": -1.6475239992141724, "logps/chosen": -176.7261962890625, "logps/rejected": -201.78408813476562, "loss": 0.6428, "rewards/accuracies": 0.625, "rewards/chosen": -0.31473392248153687, "rewards/margins": 0.2050209641456604, "rewards/rejected": -0.5197548866271973, "step": 538 }, { "epoch": 0.71, "learning_rate": 4.52127811020996e-05, "logits/chosen": -2.0809147357940674, "logits/rejected": -2.0989580154418945, "logps/chosen": -228.8104705810547, "logps/rejected": -215.7156524658203, "loss": 0.7835, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21490764617919922, "rewards/margins": -0.11036863178014755, "rewards/rejected": -0.10453899949789047, "step": 539 }, { "epoch": 0.71, "learning_rate": 4.5191674962899314e-05, "logits/chosen": -1.7017827033996582, "logits/rejected": -1.7276175022125244, "logps/chosen": -155.532470703125, "logps/rejected": -170.7515411376953, "loss": 0.788, "rewards/accuracies": 0.25, "rewards/chosen": -0.5771217346191406, "rewards/margins": -0.06404206156730652, "rewards/rejected": -0.5130796432495117, "step": 540 }, { "epoch": 0.71, "learning_rate": 4.5170527348267054e-05, "logits/chosen": -1.8137165307998657, "logits/rejected": -1.7630757093429565, "logps/chosen": -177.9446563720703, "logps/rejected": -174.14016723632812, "loss": 0.7775, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48409003019332886, "rewards/margins": -0.04493946209549904, "rewards/rejected": -0.4391506016254425, "step": 541 }, { "epoch": 0.71, "learning_rate": 4.5149338301641845e-05, "logits/chosen": -2.1948161125183105, "logits/rejected": -2.1407151222229004, "logps/chosen": -170.88812255859375, "logps/rejected": -178.73684692382812, "loss": 0.7424, "rewards/accuracies": 0.5, "rewards/chosen": -0.1686539500951767, "rewards/margins": -0.012474283576011658, "rewards/rejected": -0.15617968142032623, "step": 542 }, { "epoch": 0.71, "learning_rate": 4.512810786654779e-05, "logits/chosen": -2.117692708969116, "logits/rejected": -2.156689405441284, "logps/chosen": -214.4130859375, "logps/rejected": -214.46578979492188, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": -0.06381916999816895, "rewards/margins": 0.22082670032978058, "rewards/rejected": -0.28464585542678833, "step": 543 }, { "epoch": 0.71, "learning_rate": 4.510683608659403e-05, "logits/chosen": -2.0320470333099365, "logits/rejected": -2.0040557384490967, "logps/chosen": -163.62440490722656, "logps/rejected": -147.44810485839844, "loss": 0.985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.31194961071014404, "rewards/margins": -0.37895485758781433, "rewards/rejected": 0.06700524687767029, "step": 544 }, { "epoch": 0.71, "learning_rate": 4.508552300547463e-05, "logits/chosen": -1.8441392183303833, "logits/rejected": -1.8550631999969482, "logps/chosen": -165.3875274658203, "logps/rejected": -163.02366638183594, "loss": 0.9708, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3251355290412903, "rewards/margins": -0.3274462819099426, "rewards/rejected": 0.002310771495103836, "step": 545 }, { "epoch": 0.71, "learning_rate": 4.506416866696848e-05, "logits/chosen": -1.8278872966766357, "logits/rejected": -1.807031273841858, "logps/chosen": -184.7231903076172, "logps/rejected": -194.12863159179688, "loss": 0.7609, "rewards/accuracies": 0.5, "rewards/chosen": -0.35950741171836853, "rewards/margins": 0.040937766432762146, "rewards/rejected": -0.40044522285461426, "step": 546 }, { "epoch": 0.72, "learning_rate": 4.504277311493922e-05, "logits/chosen": -1.9982786178588867, "logits/rejected": -1.9985647201538086, "logps/chosen": -171.0308380126953, "logps/rejected": -187.55636596679688, "loss": 0.6264, "rewards/accuracies": 0.625, "rewards/chosen": -0.04028481990098953, "rewards/margins": 0.33397042751312256, "rewards/rejected": -0.3742552697658539, "step": 547 }, { "epoch": 0.72, "learning_rate": 4.502133639333516e-05, "logits/chosen": -1.8734229803085327, "logits/rejected": -1.8940666913986206, "logps/chosen": -170.61341857910156, "logps/rejected": -159.20034790039062, "loss": 0.7322, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08178206533193588, "rewards/margins": 0.0979895144701004, "rewards/rejected": -0.016207464039325714, "step": 548 }, { "epoch": 0.72, "learning_rate": 4.499985854618915e-05, "logits/chosen": -1.7860755920410156, "logits/rejected": -1.8111528158187866, "logps/chosen": -166.02748107910156, "logps/rejected": -184.15676879882812, "loss": 0.7579, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3080641031265259, "rewards/margins": -0.010175898671150208, "rewards/rejected": -0.29788821935653687, "step": 549 }, { "epoch": 0.72, "learning_rate": 4.497833961761855e-05, "logits/chosen": -1.3680813312530518, "logits/rejected": -1.399601697921753, "logps/chosen": -177.28103637695312, "logps/rejected": -221.0879364013672, "loss": 0.745, "rewards/accuracies": 0.375, "rewards/chosen": -0.43804460763931274, "rewards/margins": 0.094999298453331, "rewards/rejected": -0.5330439209938049, "step": 550 }, { "epoch": 0.72, "learning_rate": 4.495677965182506e-05, "logits/chosen": -1.704332947731018, "logits/rejected": -1.8049595355987549, "logps/chosen": -214.8744354248047, "logps/rejected": -240.92410278320312, "loss": 0.6746, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26206129789352417, "rewards/margins": 0.24123568832874298, "rewards/rejected": -0.503296971321106, "step": 551 }, { "epoch": 0.72, "learning_rate": 4.4935178693094714e-05, "logits/chosen": -1.9950153827667236, "logits/rejected": -1.9464647769927979, "logps/chosen": -191.90756225585938, "logps/rejected": -199.6075439453125, "loss": 0.6123, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23637795448303223, "rewards/margins": 0.35672998428344727, "rewards/rejected": -0.12035202980041504, "step": 552 }, { "epoch": 0.72, "learning_rate": 4.491353678579774e-05, "logits/chosen": -2.1501073837280273, "logits/rejected": -2.076387405395508, "logps/chosen": -207.20103454589844, "logps/rejected": -181.49301147460938, "loss": 0.6066, "rewards/accuracies": 0.75, "rewards/chosen": 0.04991075396537781, "rewards/margins": 0.3844224512577057, "rewards/rejected": -0.3345116972923279, "step": 553 }, { "epoch": 0.73, "learning_rate": 4.489185397438845e-05, "logits/chosen": -2.0605039596557617, "logits/rejected": -2.0319156646728516, "logps/chosen": -227.3732147216797, "logps/rejected": -207.87069702148438, "loss": 0.92, "rewards/accuracies": 0.1875, "rewards/chosen": -0.13487425446510315, "rewards/margins": -0.36219164729118347, "rewards/rejected": 0.22731736302375793, "step": 554 }, { "epoch": 0.73, "learning_rate": 4.4870130303405214e-05, "logits/chosen": -1.849971890449524, "logits/rejected": -1.7690582275390625, "logps/chosen": -181.0364532470703, "logps/rejected": -200.14926147460938, "loss": 0.9661, "rewards/accuracies": 0.3125, "rewards/chosen": -0.27974945306777954, "rewards/margins": -0.3051350712776184, "rewards/rejected": 0.025385625660419464, "step": 555 }, { "epoch": 0.73, "learning_rate": 4.484836581747032e-05, "logits/chosen": -1.952078938484192, "logits/rejected": -1.961308240890503, "logps/chosen": -178.1840057373047, "logps/rejected": -183.68377685546875, "loss": 0.5584, "rewards/accuracies": 0.75, "rewards/chosen": -0.3255058526992798, "rewards/margins": 0.4501607418060303, "rewards/rejected": -0.7756666541099548, "step": 556 }, { "epoch": 0.73, "learning_rate": 4.4826560561289865e-05, "logits/chosen": -1.9457815885543823, "logits/rejected": -2.0380780696868896, "logps/chosen": -178.47607421875, "logps/rejected": -187.38902282714844, "loss": 0.87, "rewards/accuracies": 0.4375, "rewards/chosen": -0.361419141292572, "rewards/margins": -0.22761335968971252, "rewards/rejected": -0.1338057518005371, "step": 557 }, { "epoch": 0.73, "learning_rate": 4.4804714579653736e-05, "logits/chosen": -1.8216781616210938, "logits/rejected": -1.8323644399642944, "logps/chosen": -235.6400146484375, "logps/rejected": -214.7845001220703, "loss": 0.8859, "rewards/accuracies": 0.375, "rewards/chosen": -0.32507357001304626, "rewards/margins": -0.19276276230812073, "rewards/rejected": -0.13231079280376434, "step": 558 }, { "epoch": 0.73, "learning_rate": 4.4782827917435454e-05, "logits/chosen": -2.179039716720581, "logits/rejected": -2.2042417526245117, "logps/chosen": -138.87144470214844, "logps/rejected": -152.27728271484375, "loss": 0.859, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2654874324798584, "rewards/margins": -0.15553446114063263, "rewards/rejected": -0.10995297133922577, "step": 559 }, { "epoch": 0.73, "learning_rate": 4.4760900619592085e-05, "logits/chosen": -1.9957417249679565, "logits/rejected": -2.006272315979004, "logps/chosen": -156.08970642089844, "logps/rejected": -156.20684814453125, "loss": 0.5498, "rewards/accuracies": 0.625, "rewards/chosen": 0.06699158251285553, "rewards/margins": 0.520176887512207, "rewards/rejected": -0.4531853199005127, "step": 560 }, { "epoch": 0.73, "learning_rate": 4.4738932731164194e-05, "logits/chosen": -2.0068199634552, "logits/rejected": -2.0332655906677246, "logps/chosen": -192.71728515625, "logps/rejected": -208.94129943847656, "loss": 0.8786, "rewards/accuracies": 0.5, "rewards/chosen": -0.30353277921676636, "rewards/margins": -0.13430212438106537, "rewards/rejected": -0.16923066973686218, "step": 561 }, { "epoch": 0.74, "learning_rate": 4.47169242972757e-05, "logits/chosen": -2.0624818801879883, "logits/rejected": -2.0608577728271484, "logps/chosen": -186.4285430908203, "logps/rejected": -195.51882934570312, "loss": 0.7084, "rewards/accuracies": 0.4375, "rewards/chosen": 0.07775077223777771, "rewards/margins": 0.15880751609802246, "rewards/rejected": -0.08105673640966415, "step": 562 }, { "epoch": 0.74, "learning_rate": 4.469487536313381e-05, "logits/chosen": -1.780775547027588, "logits/rejected": -1.7067487239837646, "logps/chosen": -182.54295349121094, "logps/rejected": -186.3512420654297, "loss": 0.8105, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5589167475700378, "rewards/margins": -0.0893404483795166, "rewards/rejected": -0.46957623958587646, "step": 563 }, { "epoch": 0.74, "learning_rate": 4.467278597402894e-05, "logits/chosen": -1.8799240589141846, "logits/rejected": -1.8977303504943848, "logps/chosen": -151.5462646484375, "logps/rejected": -153.37344360351562, "loss": 0.6187, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06963834166526794, "rewards/margins": 0.30848821997642517, "rewards/rejected": -0.23884987831115723, "step": 564 }, { "epoch": 0.74, "learning_rate": 4.465065617533457e-05, "logits/chosen": -1.7631909847259521, "logits/rejected": -1.7585983276367188, "logps/chosen": -192.2979278564453, "logps/rejected": -191.92445373535156, "loss": 0.664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0888519287109375, "rewards/margins": 0.18764151632785797, "rewards/rejected": -0.27649345993995667, "step": 565 }, { "epoch": 0.74, "learning_rate": 4.462848601250722e-05, "logits/chosen": -2.05842924118042, "logits/rejected": -2.0034210681915283, "logps/chosen": -167.9142608642578, "logps/rejected": -176.2246551513672, "loss": 0.754, "rewards/accuracies": 0.5, "rewards/chosen": -0.05985128879547119, "rewards/margins": -0.02553650364279747, "rewards/rejected": -0.034314800053834915, "step": 566 }, { "epoch": 0.74, "learning_rate": 4.4606275531086295e-05, "logits/chosen": -1.7910778522491455, "logits/rejected": -1.746850848197937, "logps/chosen": -146.9508056640625, "logps/rejected": -154.16268920898438, "loss": 0.7626, "rewards/accuracies": 0.375, "rewards/chosen": -0.19962230324745178, "rewards/margins": -0.01553274691104889, "rewards/rejected": -0.1840895563364029, "step": 567 }, { "epoch": 0.74, "learning_rate": 4.4584024776694035e-05, "logits/chosen": -1.7370885610580444, "logits/rejected": -1.7392570972442627, "logps/chosen": -195.53094482421875, "logps/rejected": -185.76638793945312, "loss": 0.9247, "rewards/accuracies": 0.1875, "rewards/chosen": -0.5089238286018372, "rewards/margins": -0.37316685914993286, "rewards/rejected": -0.13575701415538788, "step": 568 }, { "epoch": 0.74, "learning_rate": 4.45617337950354e-05, "logits/chosen": -2.0297796726226807, "logits/rejected": -1.9959697723388672, "logps/chosen": -187.27316284179688, "logps/rejected": -171.93833923339844, "loss": 0.853, "rewards/accuracies": 0.4375, "rewards/chosen": -0.13575240969657898, "rewards/margins": -0.1941533386707306, "rewards/rejected": 0.058400966227054596, "step": 569 }, { "epoch": 0.75, "learning_rate": 4.453940263189797e-05, "logits/chosen": -1.8683103322982788, "logits/rejected": -1.8316171169281006, "logps/chosen": -245.50885009765625, "logps/rejected": -218.13079833984375, "loss": 0.9437, "rewards/accuracies": 0.375, "rewards/chosen": -0.6386896967887878, "rewards/margins": -0.3385563790798187, "rewards/rejected": -0.3001333475112915, "step": 570 }, { "epoch": 0.75, "learning_rate": 4.4517031333151874e-05, "logits/chosen": -1.982710361480713, "logits/rejected": -2.028301239013672, "logps/chosen": -147.2786865234375, "logps/rejected": -162.232177734375, "loss": 0.762, "rewards/accuracies": 0.5, "rewards/chosen": 0.11944058537483215, "rewards/margins": 0.18246760964393616, "rewards/rejected": -0.06302699446678162, "step": 571 }, { "epoch": 0.75, "learning_rate": 4.449461994474968e-05, "logits/chosen": -1.6838575601577759, "logits/rejected": -1.7166067361831665, "logps/chosen": -197.30078125, "logps/rejected": -184.3292999267578, "loss": 0.8271, "rewards/accuracies": 0.25, "rewards/chosen": -0.18462339043617249, "rewards/margins": -0.059001460671424866, "rewards/rejected": -0.12562192976474762, "step": 572 }, { "epoch": 0.75, "learning_rate": 4.44721685127263e-05, "logits/chosen": -2.028676986694336, "logits/rejected": -2.0241925716400146, "logps/chosen": -171.38328552246094, "logps/rejected": -168.54164123535156, "loss": 0.9617, "rewards/accuracies": 0.25, "rewards/chosen": -0.37711527943611145, "rewards/margins": -0.406730592250824, "rewards/rejected": 0.029615353792905807, "step": 573 }, { "epoch": 0.75, "learning_rate": 4.4449677083198896e-05, "logits/chosen": -1.7875943183898926, "logits/rejected": -1.790823221206665, "logps/chosen": -166.38113403320312, "logps/rejected": -168.552490234375, "loss": 0.8054, "rewards/accuracies": 0.5, "rewards/chosen": -0.35224342346191406, "rewards/margins": -0.07848221063613892, "rewards/rejected": -0.27376121282577515, "step": 574 }, { "epoch": 0.75, "learning_rate": 4.4427145702366804e-05, "logits/chosen": -1.856335163116455, "logits/rejected": -1.8741549253463745, "logps/chosen": -148.2678680419922, "logps/rejected": -154.46356201171875, "loss": 0.8199, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3622299134731293, "rewards/margins": -0.16889148950576782, "rewards/rejected": -0.19333842396736145, "step": 575 }, { "epoch": 0.75, "learning_rate": 4.440457441651139e-05, "logits/chosen": -2.041019916534424, "logits/rejected": -2.060852289199829, "logps/chosen": -163.51490783691406, "logps/rejected": -167.4606170654297, "loss": 0.7434, "rewards/accuracies": 0.5, "rewards/chosen": -0.2925058901309967, "rewards/margins": 0.018959401175379753, "rewards/rejected": -0.3114652931690216, "step": 576 }, { "epoch": 0.76, "learning_rate": 4.4381963271996044e-05, "logits/chosen": -1.9944902658462524, "logits/rejected": -1.9588004350662231, "logps/chosen": -197.7550506591797, "logps/rejected": -220.91131591796875, "loss": 0.7051, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7190204858779907, "rewards/margins": 0.08902256935834885, "rewards/rejected": -0.8080430626869202, "step": 577 }, { "epoch": 0.76, "learning_rate": 4.435931231526597e-05, "logits/chosen": -1.665032148361206, "logits/rejected": -1.7525084018707275, "logps/chosen": -179.66204833984375, "logps/rejected": -176.45870971679688, "loss": 0.6586, "rewards/accuracies": 0.5, "rewards/chosen": -0.15862412750720978, "rewards/margins": 0.2471276819705963, "rewards/rejected": -0.4057517945766449, "step": 578 }, { "epoch": 0.76, "learning_rate": 4.433662159284818e-05, "logits/chosen": -2.038362741470337, "logits/rejected": -2.01652193069458, "logps/chosen": -164.3741455078125, "logps/rejected": -179.5104522705078, "loss": 0.7518, "rewards/accuracies": 0.5, "rewards/chosen": -0.31034964323043823, "rewards/margins": 0.014281976036727428, "rewards/rejected": -0.32463157176971436, "step": 579 }, { "epoch": 0.76, "learning_rate": 4.4313891151351375e-05, "logits/chosen": -1.9882714748382568, "logits/rejected": -1.9608687162399292, "logps/chosen": -172.2445068359375, "logps/rejected": -166.55502319335938, "loss": 0.8068, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2949073314666748, "rewards/margins": 0.04608011618256569, "rewards/rejected": -0.3409874737262726, "step": 580 }, { "epoch": 0.76, "learning_rate": 4.429112103746582e-05, "logits/chosen": -1.999483585357666, "logits/rejected": -1.8845561742782593, "logps/chosen": -175.16586303710938, "logps/rejected": -193.78138732910156, "loss": 0.7416, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3049665093421936, "rewards/margins": 0.03948044776916504, "rewards/rejected": -0.34444695711135864, "step": 581 }, { "epoch": 0.76, "learning_rate": 4.4268311297963295e-05, "logits/chosen": -2.089709758758545, "logits/rejected": -2.10359263420105, "logps/chosen": -188.5309600830078, "logps/rejected": -185.20364379882812, "loss": 0.6649, "rewards/accuracies": 0.5, "rewards/chosen": -0.11079156398773193, "rewards/margins": 0.3179706037044525, "rewards/rejected": -0.42876213788986206, "step": 582 }, { "epoch": 0.76, "learning_rate": 4.4245461979696937e-05, "logits/chosen": -1.8897924423217773, "logits/rejected": -1.8803207874298096, "logps/chosen": -244.1274871826172, "logps/rejected": -252.70843505859375, "loss": 0.8402, "rewards/accuracies": 0.4375, "rewards/chosen": -0.36066266894340515, "rewards/margins": -0.10755321383476257, "rewards/rejected": -0.2531094551086426, "step": 583 }, { "epoch": 0.76, "learning_rate": 4.422257312960123e-05, "logits/chosen": -1.826185703277588, "logits/rejected": -1.883529782295227, "logps/chosen": -177.42892456054688, "logps/rejected": -199.69570922851562, "loss": 0.7324, "rewards/accuracies": 0.4375, "rewards/chosen": -0.18842053413391113, "rewards/margins": 0.08203045278787613, "rewards/rejected": -0.27045097947120667, "step": 584 }, { "epoch": 0.77, "learning_rate": 4.419964479469182e-05, "logits/chosen": -1.8737729787826538, "logits/rejected": -1.8830443620681763, "logps/chosen": -179.8202362060547, "logps/rejected": -186.00640869140625, "loss": 0.717, "rewards/accuracies": 0.4375, "rewards/chosen": -0.18414545059204102, "rewards/margins": 0.08150999248027802, "rewards/rejected": -0.2656554579734802, "step": 585 }, { "epoch": 0.77, "learning_rate": 4.417667702206548e-05, "logits/chosen": -1.9959778785705566, "logits/rejected": -2.040126085281372, "logps/chosen": -163.7072296142578, "logps/rejected": -166.8711700439453, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.08035236597061157, "rewards/margins": 0.12447042763233185, "rewards/rejected": -0.20482276380062103, "step": 586 }, { "epoch": 0.77, "learning_rate": 4.415366985889998e-05, "logits/chosen": -1.8016334772109985, "logits/rejected": -1.7292026281356812, "logps/chosen": -221.72146606445312, "logps/rejected": -246.97320556640625, "loss": 0.5375, "rewards/accuracies": 0.625, "rewards/chosen": -0.22102700173854828, "rewards/margins": 0.4568701386451721, "rewards/rejected": -0.677897036075592, "step": 587 }, { "epoch": 0.77, "learning_rate": 4.413062335245402e-05, "logits/chosen": -2.1090190410614014, "logits/rejected": -2.0904922485351562, "logps/chosen": -165.54083251953125, "logps/rejected": -178.31280517578125, "loss": 0.9611, "rewards/accuracies": 0.375, "rewards/chosen": -0.23936405777931213, "rewards/margins": -0.30644306540489197, "rewards/rejected": 0.06707899272441864, "step": 588 }, { "epoch": 0.77, "learning_rate": 4.410753755006708e-05, "logits/chosen": -2.0900731086730957, "logits/rejected": -2.109846591949463, "logps/chosen": -163.52517700195312, "logps/rejected": -176.35760498046875, "loss": 0.7631, "rewards/accuracies": 0.5, "rewards/chosen": -0.0015172064304351807, "rewards/margins": -0.01845034398138523, "rewards/rejected": 0.016933124512434006, "step": 589 }, { "epoch": 0.77, "learning_rate": 4.408441249915938e-05, "logits/chosen": -1.9141035079956055, "logits/rejected": -1.933986783027649, "logps/chosen": -172.36045837402344, "logps/rejected": -175.99472045898438, "loss": 0.9, "rewards/accuracies": 0.375, "rewards/chosen": -0.4615350365638733, "rewards/margins": -0.29253697395324707, "rewards/rejected": -0.16899806261062622, "step": 590 }, { "epoch": 0.77, "learning_rate": 4.4061248247231776e-05, "logits/chosen": -1.7625254392623901, "logits/rejected": -1.7860521078109741, "logps/chosen": -214.19223022460938, "logps/rejected": -205.5415802001953, "loss": 0.8853, "rewards/accuracies": 0.375, "rewards/chosen": -0.8153303861618042, "rewards/margins": -0.16456466913223267, "rewards/rejected": -0.6507657170295715, "step": 591 }, { "epoch": 0.77, "learning_rate": 4.4038044841865614e-05, "logits/chosen": -1.9525036811828613, "logits/rejected": -1.9083642959594727, "logps/chosen": -158.90896606445312, "logps/rejected": -151.97276306152344, "loss": 0.7655, "rewards/accuracies": 0.5, "rewards/chosen": -0.2276325672864914, "rewards/margins": -0.026436805725097656, "rewards/rejected": -0.20119577646255493, "step": 592 }, { "epoch": 0.78, "learning_rate": 4.401480233072268e-05, "logits/chosen": -1.9390695095062256, "logits/rejected": -1.9349026679992676, "logps/chosen": -169.4829864501953, "logps/rejected": -175.8646240234375, "loss": 0.8046, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3574194014072418, "rewards/margins": -0.10936145484447479, "rewards/rejected": -0.24805793166160583, "step": 593 }, { "epoch": 0.78, "learning_rate": 4.399152076154509e-05, "logits/chosen": -1.7492634057998657, "logits/rejected": -1.7610502243041992, "logps/chosen": -187.03453063964844, "logps/rejected": -181.9283905029297, "loss": 0.7496, "rewards/accuracies": 0.4375, "rewards/chosen": -0.39506590366363525, "rewards/margins": 0.02907838299870491, "rewards/rejected": -0.42414435744285583, "step": 594 }, { "epoch": 0.78, "learning_rate": 4.396820018215518e-05, "logits/chosen": -1.5396924018859863, "logits/rejected": -1.6151387691497803, "logps/chosen": -155.45480346679688, "logps/rejected": -160.28761291503906, "loss": 0.7919, "rewards/accuracies": 0.3125, "rewards/chosen": -0.2293637990951538, "rewards/margins": -0.06823254376649857, "rewards/rejected": -0.16113126277923584, "step": 595 }, { "epoch": 0.78, "learning_rate": 4.394484064045542e-05, "logits/chosen": -1.7704952955245972, "logits/rejected": -1.838837742805481, "logps/chosen": -169.5724334716797, "logps/rejected": -215.3038330078125, "loss": 0.8807, "rewards/accuracies": 0.375, "rewards/chosen": -0.5652868151664734, "rewards/margins": -0.16716551780700684, "rewards/rejected": -0.39812129735946655, "step": 596 }, { "epoch": 0.78, "learning_rate": 4.392144218442831e-05, "logits/chosen": -1.8503105640411377, "logits/rejected": -1.917284369468689, "logps/chosen": -191.72821044921875, "logps/rejected": -207.21307373046875, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": -0.15306434035301208, "rewards/margins": 0.08791357278823853, "rewards/rejected": -0.2409779131412506, "step": 597 }, { "epoch": 0.78, "learning_rate": 4.3898004862136286e-05, "logits/chosen": -1.8588240146636963, "logits/rejected": -1.8746473789215088, "logps/chosen": -155.37948608398438, "logps/rejected": -164.0855712890625, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": -0.22585612535476685, "rewards/margins": 0.20000189542770386, "rewards/rejected": -0.4258580207824707, "step": 598 }, { "epoch": 0.78, "learning_rate": 4.3874528721721624e-05, "logits/chosen": -2.0860049724578857, "logits/rejected": -2.040462017059326, "logps/chosen": -178.65829467773438, "logps/rejected": -164.56826782226562, "loss": 0.8497, "rewards/accuracies": 0.5, "rewards/chosen": -0.2987426221370697, "rewards/margins": -0.12979570031166077, "rewards/rejected": -0.16894695162773132, "step": 599 }, { "epoch": 0.79, "learning_rate": 4.385101381140633e-05, "logits/chosen": -2.0084433555603027, "logits/rejected": -1.9932807683944702, "logps/chosen": -177.64755249023438, "logps/rejected": -183.24024963378906, "loss": 0.6995, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26401758193969727, "rewards/margins": 0.12354859709739685, "rewards/rejected": -0.38756614923477173, "step": 600 }, { "epoch": 0.79, "learning_rate": 4.382746017949203e-05, "logits/chosen": -1.8708142042160034, "logits/rejected": -1.7702758312225342, "logps/chosen": -176.54803466796875, "logps/rejected": -182.53933715820312, "loss": 0.9847, "rewards/accuracies": 0.375, "rewards/chosen": -0.42444175481796265, "rewards/margins": -0.36502790451049805, "rewards/rejected": -0.059413861483335495, "step": 601 }, { "epoch": 0.79, "learning_rate": 4.380386787435992e-05, "logits/chosen": -1.9202303886413574, "logits/rejected": -1.940227746963501, "logps/chosen": -156.80599975585938, "logps/rejected": -165.71456909179688, "loss": 0.8598, "rewards/accuracies": 0.375, "rewards/chosen": -0.22691097855567932, "rewards/margins": -0.20334003865718842, "rewards/rejected": -0.02357092872262001, "step": 602 }, { "epoch": 0.79, "learning_rate": 4.378023694447061e-05, "logits/chosen": -1.751800537109375, "logits/rejected": -1.6968414783477783, "logps/chosen": -203.2870635986328, "logps/rejected": -175.4205322265625, "loss": 0.8759, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5231369733810425, "rewards/margins": -0.2074095904827118, "rewards/rejected": -0.3157273828983307, "step": 603 }, { "epoch": 0.79, "learning_rate": 4.375656743836407e-05, "logits/chosen": -1.9159809350967407, "logits/rejected": -1.8906610012054443, "logps/chosen": -158.56918334960938, "logps/rejected": -162.91845703125, "loss": 0.7958, "rewards/accuracies": 0.5, "rewards/chosen": -0.34893998503685, "rewards/margins": -0.01667727530002594, "rewards/rejected": -0.33226269483566284, "step": 604 }, { "epoch": 0.79, "learning_rate": 4.373285940465948e-05, "logits/chosen": -2.0548174381256104, "logits/rejected": -2.0111958980560303, "logps/chosen": -172.6857147216797, "logps/rejected": -161.885498046875, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.16591539978981018, "rewards/margins": 0.14734701812267303, "rewards/rejected": -0.313262403011322, "step": 605 }, { "epoch": 0.79, "learning_rate": 4.370911289205518e-05, "logits/chosen": -1.9247201681137085, "logits/rejected": -1.8969950675964355, "logps/chosen": -193.29541015625, "logps/rejected": -193.72703552246094, "loss": 0.8154, "rewards/accuracies": 0.25, "rewards/chosen": -0.11465515196323395, "rewards/margins": -0.17800137400627136, "rewards/rejected": 0.06334619224071503, "step": 606 }, { "epoch": 0.79, "learning_rate": 4.368532794932854e-05, "logits/chosen": -1.6993311643600464, "logits/rejected": -1.7183904647827148, "logps/chosen": -196.13595581054688, "logps/rejected": -198.9134521484375, "loss": 0.9953, "rewards/accuracies": 0.375, "rewards/chosen": -0.3177400529384613, "rewards/margins": -0.28991618752479553, "rewards/rejected": -0.02782391384243965, "step": 607 }, { "epoch": 0.8, "learning_rate": 4.366150462533588e-05, "logits/chosen": -1.9511172771453857, "logits/rejected": -2.002882957458496, "logps/chosen": -178.0076141357422, "logps/rejected": -182.05166625976562, "loss": 0.7215, "rewards/accuracies": 0.625, "rewards/chosen": -0.021460914984345436, "rewards/margins": 0.14429938793182373, "rewards/rejected": -0.16576027870178223, "step": 608 }, { "epoch": 0.8, "learning_rate": 4.363764296901234e-05, "logits/chosen": -1.8208976984024048, "logits/rejected": -1.8266651630401611, "logps/chosen": -164.09571838378906, "logps/rejected": -178.66693115234375, "loss": 0.7568, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1179531067609787, "rewards/margins": 0.09427131712436676, "rewards/rejected": -0.21222442388534546, "step": 609 }, { "epoch": 0.8, "learning_rate": 4.361374302937182e-05, "logits/chosen": -1.5063440799713135, "logits/rejected": -1.5503039360046387, "logps/chosen": -195.34811401367188, "logps/rejected": -200.38067626953125, "loss": 0.7159, "rewards/accuracies": 0.5, "rewards/chosen": -0.013212010264396667, "rewards/margins": 0.05199579522013664, "rewards/rejected": -0.06520780920982361, "step": 610 }, { "epoch": 0.8, "learning_rate": 4.358980485550683e-05, "logits/chosen": -1.8006740808486938, "logits/rejected": -1.7630650997161865, "logps/chosen": -198.05638122558594, "logps/rejected": -167.6128387451172, "loss": 0.8621, "rewards/accuracies": 0.3125, "rewards/chosen": -0.465340793132782, "rewards/margins": -0.2472882717847824, "rewards/rejected": -0.21805252134799957, "step": 611 }, { "epoch": 0.8, "learning_rate": 4.356582849658845e-05, "logits/chosen": -1.8933278322219849, "logits/rejected": -1.9565207958221436, "logps/chosen": -165.55247497558594, "logps/rejected": -172.00611877441406, "loss": 0.6704, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09733805060386658, "rewards/margins": 0.23297539353370667, "rewards/rejected": -0.13563737273216248, "step": 612 }, { "epoch": 0.8, "learning_rate": 4.354181400186617e-05, "logits/chosen": -1.3361432552337646, "logits/rejected": -1.4160830974578857, "logps/chosen": -204.05889892578125, "logps/rejected": -203.48159790039062, "loss": 0.8597, "rewards/accuracies": 0.375, "rewards/chosen": -0.468783974647522, "rewards/margins": -0.07349361479282379, "rewards/rejected": -0.3952903151512146, "step": 613 }, { "epoch": 0.8, "learning_rate": 4.351776142066782e-05, "logits/chosen": -1.8746682405471802, "logits/rejected": -1.9063349962234497, "logps/chosen": -161.05291748046875, "logps/rejected": -155.7138671875, "loss": 0.7912, "rewards/accuracies": 0.3125, "rewards/chosen": 0.021917428821325302, "rewards/margins": -0.07229090481996536, "rewards/rejected": 0.09420835971832275, "step": 614 }, { "epoch": 0.8, "learning_rate": 4.349367080239946e-05, "logits/chosen": -1.9660152196884155, "logits/rejected": -1.9447054862976074, "logps/chosen": -171.2956085205078, "logps/rejected": -167.69021606445312, "loss": 0.747, "rewards/accuracies": 0.4375, "rewards/chosen": -0.011758615262806416, "rewards/margins": -0.006392620503902435, "rewards/rejected": -0.005365990102291107, "step": 615 }, { "epoch": 0.81, "learning_rate": 4.34695421965453e-05, "logits/chosen": -1.7947853803634644, "logits/rejected": -1.8116130828857422, "logps/chosen": -191.9875946044922, "logps/rejected": -188.09500122070312, "loss": 0.7336, "rewards/accuracies": 0.5, "rewards/chosen": 0.2941385507583618, "rewards/margins": 0.003938054665923119, "rewards/rejected": 0.29020047187805176, "step": 616 }, { "epoch": 0.81, "learning_rate": 4.344537565266755e-05, "logits/chosen": -1.918999433517456, "logits/rejected": -1.9537014961242676, "logps/chosen": -172.28302001953125, "logps/rejected": -184.16195678710938, "loss": 0.777, "rewards/accuracies": 0.5, "rewards/chosen": 0.13267874717712402, "rewards/margins": -0.03271746635437012, "rewards/rejected": 0.16539622843265533, "step": 617 }, { "epoch": 0.81, "learning_rate": 4.342117122040637e-05, "logits/chosen": -1.900996208190918, "logits/rejected": -1.8993828296661377, "logps/chosen": -209.77993774414062, "logps/rejected": -206.2876739501953, "loss": 0.9541, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1300465315580368, "rewards/margins": -0.33919087052345276, "rewards/rejected": 0.46923738718032837, "step": 618 }, { "epoch": 0.81, "learning_rate": 4.339692894947974e-05, "logits/chosen": -1.8545596599578857, "logits/rejected": -1.860669493675232, "logps/chosen": -186.5775909423828, "logps/rejected": -209.63888549804688, "loss": 0.7799, "rewards/accuracies": 0.5, "rewards/chosen": 0.011012416332960129, "rewards/margins": 0.014494312927126884, "rewards/rejected": -0.003481905907392502, "step": 619 }, { "epoch": 0.81, "learning_rate": 4.3372648889683364e-05, "logits/chosen": -1.8433650732040405, "logits/rejected": -1.902207612991333, "logps/chosen": -193.0688934326172, "logps/rejected": -177.6405792236328, "loss": 0.7566, "rewards/accuracies": 0.5, "rewards/chosen": 0.0212489552795887, "rewards/margins": 0.15483959019184113, "rewards/rejected": -0.13359062373638153, "step": 620 }, { "epoch": 0.81, "learning_rate": 4.334833109089057e-05, "logits/chosen": -1.6036548614501953, "logits/rejected": -1.5984629392623901, "logps/chosen": -166.178955078125, "logps/rejected": -174.63314819335938, "loss": 0.7545, "rewards/accuracies": 0.5, "rewards/chosen": -0.20678240060806274, "rewards/margins": 0.031082022935152054, "rewards/rejected": -0.2378644049167633, "step": 621 }, { "epoch": 0.81, "learning_rate": 4.33239756030522e-05, "logits/chosen": -1.8155311346054077, "logits/rejected": -1.759658694267273, "logps/chosen": -196.43370056152344, "logps/rejected": -193.10064697265625, "loss": 0.8896, "rewards/accuracies": 0.375, "rewards/chosen": -0.2915378212928772, "rewards/margins": -0.29409241676330566, "rewards/rejected": 0.0025546252727508545, "step": 622 }, { "epoch": 0.82, "learning_rate": 4.329958247619651e-05, "logits/chosen": -1.74562668800354, "logits/rejected": -1.807843565940857, "logps/chosen": -167.89886474609375, "logps/rejected": -186.01974487304688, "loss": 0.5757, "rewards/accuracies": 0.625, "rewards/chosen": -0.22084513306617737, "rewards/margins": 0.36972615122795105, "rewards/rejected": -0.5905711650848389, "step": 623 }, { "epoch": 0.82, "learning_rate": 4.3275151760429075e-05, "logits/chosen": -1.8474458456039429, "logits/rejected": -1.8740853071212769, "logps/chosen": -143.50112915039062, "logps/rejected": -154.0746307373047, "loss": 0.6583, "rewards/accuracies": 0.5, "rewards/chosen": 0.10751471668481827, "rewards/margins": 0.15516290068626404, "rewards/rejected": -0.047648198902606964, "step": 624 }, { "epoch": 0.82, "learning_rate": 4.325068350593268e-05, "logits/chosen": -1.7046520709991455, "logits/rejected": -1.8137892484664917, "logps/chosen": -179.74691772460938, "logps/rejected": -195.8003387451172, "loss": 0.7036, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2236245721578598, "rewards/margins": 0.15745726227760315, "rewards/rejected": -0.38108178973197937, "step": 625 }, { "epoch": 0.82, "learning_rate": 4.322617776296723e-05, "logits/chosen": -1.7875136137008667, "logits/rejected": -1.7463059425354004, "logps/chosen": -189.130126953125, "logps/rejected": -180.15403747558594, "loss": 0.9103, "rewards/accuracies": 0.375, "rewards/chosen": -0.34625834226608276, "rewards/margins": -0.27761733531951904, "rewards/rejected": -0.06864099949598312, "step": 626 }, { "epoch": 0.82, "learning_rate": 4.320163458186961e-05, "logits/chosen": -1.7227963209152222, "logits/rejected": -1.6379646062850952, "logps/chosen": -207.108154296875, "logps/rejected": -187.44049072265625, "loss": 0.7069, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06443925201892853, "rewards/margins": 0.11024240404367447, "rewards/rejected": -0.1746816635131836, "step": 627 }, { "epoch": 0.82, "learning_rate": 4.317705401305362e-05, "logits/chosen": -1.6465739011764526, "logits/rejected": -1.676656723022461, "logps/chosen": -156.50994873046875, "logps/rejected": -169.0661163330078, "loss": 0.854, "rewards/accuracies": 0.375, "rewards/chosen": 0.04478641599416733, "rewards/margins": -0.16434717178344727, "rewards/rejected": 0.2091335952281952, "step": 628 }, { "epoch": 0.82, "learning_rate": 4.315243610700986e-05, "logits/chosen": -1.869480848312378, "logits/rejected": -1.8965606689453125, "logps/chosen": -179.2183380126953, "logps/rejected": -193.56800842285156, "loss": 0.6107, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12309933453798294, "rewards/margins": 0.24142897129058838, "rewards/rejected": -0.11832961440086365, "step": 629 }, { "epoch": 0.82, "learning_rate": 4.312778091430563e-05, "logits/chosen": -1.572332739830017, "logits/rejected": -1.5489404201507568, "logps/chosen": -185.81053161621094, "logps/rejected": -180.34814453125, "loss": 0.725, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4762308597564697, "rewards/margins": 0.07182511687278748, "rewards/rejected": -0.5480560660362244, "step": 630 }, { "epoch": 0.83, "learning_rate": 4.310308848558479e-05, "logits/chosen": -1.7954251766204834, "logits/rejected": -1.80027437210083, "logps/chosen": -217.88711547851562, "logps/rejected": -232.96463012695312, "loss": 0.8256, "rewards/accuracies": 0.25, "rewards/chosen": -0.10674677789211273, "rewards/margins": -0.12963929772377014, "rewards/rejected": 0.022892538458108902, "step": 631 }, { "epoch": 0.83, "learning_rate": 4.3078358871567706e-05, "logits/chosen": -1.759313702583313, "logits/rejected": -1.8224774599075317, "logps/chosen": -179.50665283203125, "logps/rejected": -172.76654052734375, "loss": 0.7918, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04008585214614868, "rewards/margins": 0.13793742656707764, "rewards/rejected": -0.17802327871322632, "step": 632 }, { "epoch": 0.83, "learning_rate": 4.305359212305115e-05, "logits/chosen": -1.9337170124053955, "logits/rejected": -1.9300730228424072, "logps/chosen": -174.96153259277344, "logps/rejected": -175.67324829101562, "loss": 0.66, "rewards/accuracies": 0.625, "rewards/chosen": 0.008273787796497345, "rewards/margins": 0.14017948508262634, "rewards/rejected": -0.1319057047367096, "step": 633 }, { "epoch": 0.83, "learning_rate": 4.302878829090813e-05, "logits/chosen": -1.779855728149414, "logits/rejected": -1.770744800567627, "logps/chosen": -196.88430786132812, "logps/rejected": -185.92510986328125, "loss": 0.6659, "rewards/accuracies": 0.625, "rewards/chosen": -0.15623866021633148, "rewards/margins": 0.16380999982357025, "rewards/rejected": -0.32004866003990173, "step": 634 }, { "epoch": 0.83, "learning_rate": 4.300394742608784e-05, "logits/chosen": -1.756955623626709, "logits/rejected": -1.8318352699279785, "logps/chosen": -154.08811950683594, "logps/rejected": -156.06085205078125, "loss": 0.8138, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16264663636684418, "rewards/margins": 0.004404932260513306, "rewards/rejected": -0.16705156862735748, "step": 635 }, { "epoch": 0.83, "learning_rate": 4.2979069579615564e-05, "logits/chosen": -1.8656206130981445, "logits/rejected": -1.8203411102294922, "logps/chosen": -188.10708618164062, "logps/rejected": -183.7784423828125, "loss": 0.9251, "rewards/accuracies": 0.375, "rewards/chosen": -0.29391956329345703, "rewards/margins": -0.2983461618423462, "rewards/rejected": 0.004426578059792519, "step": 636 }, { "epoch": 0.83, "learning_rate": 4.2954154802592514e-05, "logits/chosen": -1.604241132736206, "logits/rejected": -1.6417977809906006, "logps/chosen": -159.251708984375, "logps/rejected": -163.13052368164062, "loss": 0.777, "rewards/accuracies": 0.5, "rewards/chosen": -0.3667724132537842, "rewards/margins": -0.06517796963453293, "rewards/rejected": -0.30159446597099304, "step": 637 }, { "epoch": 0.83, "learning_rate": 4.292920314619578e-05, "logits/chosen": -1.7538769245147705, "logits/rejected": -1.7120615243911743, "logps/chosen": -224.12762451171875, "logps/rejected": -205.00430297851562, "loss": 0.8706, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10906004160642624, "rewards/margins": -0.1597534716129303, "rewards/rejected": 0.050693415105342865, "step": 638 }, { "epoch": 0.84, "learning_rate": 4.290421466167822e-05, "logits/chosen": -1.509183406829834, "logits/rejected": -1.4262561798095703, "logps/chosen": -180.26675415039062, "logps/rejected": -181.98175048828125, "loss": 0.9278, "rewards/accuracies": 0.25, "rewards/chosen": -0.41150203347206116, "rewards/margins": -0.36716675758361816, "rewards/rejected": -0.044335294514894485, "step": 639 }, { "epoch": 0.84, "learning_rate": 4.2879189400368314e-05, "logits/chosen": -1.4397776126861572, "logits/rejected": -1.3926661014556885, "logps/chosen": -165.068603515625, "logps/rejected": -178.24217224121094, "loss": 0.8944, "rewards/accuracies": 0.375, "rewards/chosen": 0.2168307602405548, "rewards/margins": -0.030826739966869354, "rewards/rejected": 0.24765750765800476, "step": 640 }, { "epoch": 0.84, "learning_rate": 4.2854127413670096e-05, "logits/chosen": -1.8310662508010864, "logits/rejected": -1.8567099571228027, "logps/chosen": -173.8634490966797, "logps/rejected": -170.68670654296875, "loss": 0.8247, "rewards/accuracies": 0.375, "rewards/chosen": -0.36099639534950256, "rewards/margins": -0.10963311791419983, "rewards/rejected": -0.25136324763298035, "step": 641 }, { "epoch": 0.84, "learning_rate": 4.282902875306304e-05, "logits/chosen": -1.6725221872329712, "logits/rejected": -1.6123416423797607, "logps/chosen": -210.78814697265625, "logps/rejected": -199.3596649169922, "loss": 0.7146, "rewards/accuracies": 0.5, "rewards/chosen": -0.2739197611808777, "rewards/margins": 0.1191968321800232, "rewards/rejected": -0.39311662316322327, "step": 642 }, { "epoch": 0.84, "learning_rate": 4.280389347010194e-05, "logits/chosen": -1.8086258172988892, "logits/rejected": -1.818231225013733, "logps/chosen": -175.2198944091797, "logps/rejected": -176.08119201660156, "loss": 0.8965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3234391212463379, "rewards/margins": -0.2487168163061142, "rewards/rejected": -0.07472231984138489, "step": 643 }, { "epoch": 0.84, "learning_rate": 4.277872161641682e-05, "logits/chosen": -1.8151034116744995, "logits/rejected": -1.8139681816101074, "logps/chosen": -160.6265869140625, "logps/rejected": -169.05947875976562, "loss": 0.8433, "rewards/accuracies": 0.5, "rewards/chosen": 0.014796596020460129, "rewards/margins": -0.17133474349975586, "rewards/rejected": 0.18613135814666748, "step": 644 }, { "epoch": 0.84, "learning_rate": 4.275351324371283e-05, "logits/chosen": -1.8731780052185059, "logits/rejected": -1.903558373451233, "logps/chosen": -172.57882690429688, "logps/rejected": -180.54104614257812, "loss": 0.8243, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4790874421596527, "rewards/margins": -0.1846233606338501, "rewards/rejected": -0.2944641411304474, "step": 645 }, { "epoch": 0.85, "learning_rate": 4.2728268403770145e-05, "logits/chosen": -1.8170170783996582, "logits/rejected": -1.8452297449111938, "logps/chosen": -172.42489624023438, "logps/rejected": -174.13865661621094, "loss": 0.9024, "rewards/accuracies": 0.375, "rewards/chosen": -0.345039427280426, "rewards/margins": -0.303177148103714, "rewards/rejected": -0.041862305253744125, "step": 646 }, { "epoch": 0.85, "learning_rate": 4.270298714844381e-05, "logits/chosen": -1.7615243196487427, "logits/rejected": -1.8334003686904907, "logps/chosen": -166.7871551513672, "logps/rejected": -179.83578491210938, "loss": 0.7427, "rewards/accuracies": 0.375, "rewards/chosen": -0.3320530951023102, "rewards/margins": -0.020228669047355652, "rewards/rejected": -0.3118244409561157, "step": 647 }, { "epoch": 0.85, "learning_rate": 4.267766952966369e-05, "logits/chosen": -1.7627480030059814, "logits/rejected": -1.724000334739685, "logps/chosen": -168.9439697265625, "logps/rejected": -163.2694854736328, "loss": 0.8367, "rewards/accuracies": 0.4375, "rewards/chosen": -0.42008066177368164, "rewards/margins": -0.16962656378746033, "rewards/rejected": -0.2504541277885437, "step": 648 }, { "epoch": 0.85, "learning_rate": 4.2652315599434354e-05, "logits/chosen": -1.666603922843933, "logits/rejected": -1.6485559940338135, "logps/chosen": -159.96652221679688, "logps/rejected": -157.31324768066406, "loss": 0.7431, "rewards/accuracies": 0.5, "rewards/chosen": -0.16877800226211548, "rewards/margins": 0.033473365008831024, "rewards/rejected": -0.2022513449192047, "step": 649 }, { "epoch": 0.85, "learning_rate": 4.262692540983496e-05, "logits/chosen": -1.6999112367630005, "logits/rejected": -1.7737563848495483, "logps/chosen": -156.45681762695312, "logps/rejected": -185.85430908203125, "loss": 0.7336, "rewards/accuracies": 0.5, "rewards/chosen": -0.33198750019073486, "rewards/margins": -0.019247818738222122, "rewards/rejected": -0.31273967027664185, "step": 650 }, { "epoch": 0.85, "learning_rate": 4.2601499013019126e-05, "logits/chosen": -1.680021047592163, "logits/rejected": -1.7226678133010864, "logps/chosen": -156.96827697753906, "logps/rejected": -159.31240844726562, "loss": 0.7094, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1396864801645279, "rewards/margins": 0.11252421140670776, "rewards/rejected": -0.25221067667007446, "step": 651 }, { "epoch": 0.85, "learning_rate": 4.257603646121484e-05, "logits/chosen": -1.6294937133789062, "logits/rejected": -1.6531920433044434, "logps/chosen": -194.5813446044922, "logps/rejected": -193.0689697265625, "loss": 0.8568, "rewards/accuracies": 0.4375, "rewards/chosen": -0.348048597574234, "rewards/margins": -0.0678197517991066, "rewards/rejected": -0.280228853225708, "step": 652 }, { "epoch": 0.85, "learning_rate": 4.2550537806724384e-05, "logits/chosen": -1.653900146484375, "logits/rejected": -1.725424885749817, "logps/chosen": -188.66009521484375, "logps/rejected": -189.3316650390625, "loss": 0.6937, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10200534015893936, "rewards/margins": 0.19126522541046143, "rewards/rejected": -0.2932705581188202, "step": 653 }, { "epoch": 0.86, "learning_rate": 4.2525003101924164e-05, "logits/chosen": -1.7711960077285767, "logits/rejected": -1.7922366857528687, "logps/chosen": -188.667724609375, "logps/rejected": -199.7714080810547, "loss": 0.8373, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46268340945243835, "rewards/margins": -0.047878868877887726, "rewards/rejected": -0.4148045480251312, "step": 654 }, { "epoch": 0.86, "learning_rate": 4.249943239926467e-05, "logits/chosen": -1.870734691619873, "logits/rejected": -1.888472080230713, "logps/chosen": -173.50595092773438, "logps/rejected": -187.27886962890625, "loss": 0.7626, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11934824287891388, "rewards/margins": 0.06529253721237183, "rewards/rejected": -0.1846407949924469, "step": 655 }, { "epoch": 0.86, "learning_rate": 4.247382575127031e-05, "logits/chosen": -1.6958601474761963, "logits/rejected": -1.657966136932373, "logps/chosen": -176.43829345703125, "logps/rejected": -214.62869262695312, "loss": 0.6291, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09161286056041718, "rewards/margins": 0.4105958044528961, "rewards/rejected": -0.5022085905075073, "step": 656 }, { "epoch": 0.86, "learning_rate": 4.2448183210539334e-05, "logits/chosen": -1.628333568572998, "logits/rejected": -1.6449060440063477, "logps/chosen": -148.4341583251953, "logps/rejected": -144.66708374023438, "loss": 0.6733, "rewards/accuracies": 0.5, "rewards/chosen": -0.00030353665351867676, "rewards/margins": 0.1996443122625351, "rewards/rejected": -0.19994783401489258, "step": 657 }, { "epoch": 0.86, "learning_rate": 4.2422504829743724e-05, "logits/chosen": -1.765242099761963, "logits/rejected": -1.7532581090927124, "logps/chosen": -164.06983947753906, "logps/rejected": -183.4119110107422, "loss": 0.4436, "rewards/accuracies": 0.875, "rewards/chosen": 0.08075151592493057, "rewards/margins": 0.8164087533950806, "rewards/rejected": -0.7356572151184082, "step": 658 }, { "epoch": 0.86, "learning_rate": 4.239679066162907e-05, "logits/chosen": -1.910383701324463, "logits/rejected": -1.8477028608322144, "logps/chosen": -168.18246459960938, "logps/rejected": -175.83685302734375, "loss": 0.727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29001450538635254, "rewards/margins": 0.1458943635225296, "rewards/rejected": -0.43590888381004333, "step": 659 }, { "epoch": 0.86, "learning_rate": 4.237104075901449e-05, "logits/chosen": -1.5762726068496704, "logits/rejected": -1.6210079193115234, "logps/chosen": -202.59307861328125, "logps/rejected": -213.10899353027344, "loss": 0.7975, "rewards/accuracies": 0.5, "rewards/chosen": -0.4130185544490814, "rewards/margins": -0.1117025762796402, "rewards/rejected": -0.30131596326828003, "step": 660 }, { "epoch": 0.87, "learning_rate": 4.234525517479248e-05, "logits/chosen": -1.8088970184326172, "logits/rejected": -1.833585262298584, "logps/chosen": -168.6361541748047, "logps/rejected": -173.4584503173828, "loss": 0.7173, "rewards/accuracies": 0.375, "rewards/chosen": -0.29294320940971375, "rewards/margins": 0.052368972450494766, "rewards/rejected": -0.3453121483325958, "step": 661 }, { "epoch": 0.87, "learning_rate": 4.2319433961928844e-05, "logits/chosen": -1.7081291675567627, "logits/rejected": -1.7482770681381226, "logps/chosen": -165.43646240234375, "logps/rejected": -193.29000854492188, "loss": 0.8157, "rewards/accuracies": 0.3125, "rewards/chosen": -0.42327651381492615, "rewards/margins": -0.0674619972705841, "rewards/rejected": -0.35581451654434204, "step": 662 }, { "epoch": 0.87, "learning_rate": 4.229357717346257e-05, "logits/chosen": -1.9136518239974976, "logits/rejected": -1.855428695678711, "logps/chosen": -190.4178466796875, "logps/rejected": -188.52420043945312, "loss": 0.7248, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10876584053039551, "rewards/margins": 0.00017318874597549438, "rewards/rejected": -0.1089390367269516, "step": 663 }, { "epoch": 0.87, "learning_rate": 4.226768486250572e-05, "logits/chosen": -1.8013920783996582, "logits/rejected": -1.8036880493164062, "logps/chosen": -194.70623779296875, "logps/rejected": -216.53485107421875, "loss": 0.6308, "rewards/accuracies": 0.625, "rewards/chosen": -0.5092557668685913, "rewards/margins": 0.299633264541626, "rewards/rejected": -0.8088890314102173, "step": 664 }, { "epoch": 0.87, "learning_rate": 4.224175708224332e-05, "logits/chosen": -1.7224409580230713, "logits/rejected": -1.718071460723877, "logps/chosen": -187.01220703125, "logps/rejected": -182.26296997070312, "loss": 0.8315, "rewards/accuracies": 0.375, "rewards/chosen": -0.3375519812107086, "rewards/margins": -0.13301782310009003, "rewards/rejected": -0.2045341432094574, "step": 665 }, { "epoch": 0.87, "learning_rate": 4.221579388593326e-05, "logits/chosen": -1.7278739213943481, "logits/rejected": -1.7006864547729492, "logps/chosen": -178.83615112304688, "logps/rejected": -185.47445678710938, "loss": 0.7341, "rewards/accuracies": 0.625, "rewards/chosen": -0.32711875438690186, "rewards/margins": 0.09781965613365173, "rewards/rejected": -0.4249383807182312, "step": 666 }, { "epoch": 0.87, "learning_rate": 4.218979532690616e-05, "logits/chosen": -2.025843620300293, "logits/rejected": -2.0172643661499023, "logps/chosen": -165.31512451171875, "logps/rejected": -160.6278839111328, "loss": 0.7005, "rewards/accuracies": 0.5, "rewards/chosen": -0.4214673638343811, "rewards/margins": 0.03614773973822594, "rewards/rejected": -0.45761507749557495, "step": 667 }, { "epoch": 0.87, "learning_rate": 4.216376145856529e-05, "logits/chosen": -1.802656888961792, "logits/rejected": -1.771694302558899, "logps/chosen": -196.45603942871094, "logps/rejected": -202.7981414794922, "loss": 0.5865, "rewards/accuracies": 0.75, "rewards/chosen": -0.3090572953224182, "rewards/margins": 0.4084371328353882, "rewards/rejected": -0.7174944877624512, "step": 668 }, { "epoch": 0.88, "learning_rate": 4.213769233438646e-05, "logits/chosen": -1.7967143058776855, "logits/rejected": -1.772080898284912, "logps/chosen": -237.03370666503906, "logps/rejected": -236.46771240234375, "loss": 0.8748, "rewards/accuracies": 0.5, "rewards/chosen": -0.5695147514343262, "rewards/margins": -0.10085253417491913, "rewards/rejected": -0.46866220235824585, "step": 669 }, { "epoch": 0.88, "learning_rate": 4.211158800791788e-05, "logits/chosen": -1.9533357620239258, "logits/rejected": -1.9477338790893555, "logps/chosen": -195.1766357421875, "logps/rejected": -168.0863037109375, "loss": 0.947, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6404910087585449, "rewards/margins": -0.25117361545562744, "rewards/rejected": -0.3893173635005951, "step": 670 }, { "epoch": 0.88, "learning_rate": 4.208544853278008e-05, "logits/chosen": -1.7536804676055908, "logits/rejected": -1.7879080772399902, "logps/chosen": -167.13681030273438, "logps/rejected": -163.1280517578125, "loss": 0.9277, "rewards/accuracies": 0.375, "rewards/chosen": -0.6327708959579468, "rewards/margins": -0.3013594448566437, "rewards/rejected": -0.3314114212989807, "step": 671 }, { "epoch": 0.88, "learning_rate": 4.205927396266577e-05, "logits/chosen": -1.923248291015625, "logits/rejected": -1.9297988414764404, "logps/chosen": -206.3466796875, "logps/rejected": -210.54095458984375, "loss": 0.8353, "rewards/accuracies": 0.5, "rewards/chosen": -0.27377229928970337, "rewards/margins": 0.06531517207622528, "rewards/rejected": -0.33908745646476746, "step": 672 }, { "epoch": 0.88, "learning_rate": 4.203306435133978e-05, "logits/chosen": -1.7622315883636475, "logits/rejected": -1.673543930053711, "logps/chosen": -205.08291625976562, "logps/rejected": -207.41424560546875, "loss": 0.8054, "rewards/accuracies": 0.5, "rewards/chosen": -0.26307201385498047, "rewards/margins": 0.005908198654651642, "rewards/rejected": -0.2689802348613739, "step": 673 }, { "epoch": 0.88, "learning_rate": 4.200681975263888e-05, "logits/chosen": -2.055384635925293, "logits/rejected": -2.045701503753662, "logps/chosen": -180.04840087890625, "logps/rejected": -169.86734008789062, "loss": 0.8382, "rewards/accuracies": 0.4375, "rewards/chosen": -0.41850394010543823, "rewards/margins": -0.2109215408563614, "rewards/rejected": -0.20758239924907684, "step": 674 }, { "epoch": 0.88, "learning_rate": 4.1980540220471744e-05, "logits/chosen": -1.940061092376709, "logits/rejected": -1.9028154611587524, "logps/chosen": -156.75413513183594, "logps/rejected": -165.96414184570312, "loss": 1.0616, "rewards/accuracies": 0.375, "rewards/chosen": -0.19156816601753235, "rewards/margins": -0.4865376949310303, "rewards/rejected": 0.29496949911117554, "step": 675 }, { "epoch": 0.88, "learning_rate": 4.195422580881878e-05, "logits/chosen": -1.651497721672058, "logits/rejected": -1.6224371194839478, "logps/chosen": -193.23190307617188, "logps/rejected": -176.51651000976562, "loss": 0.7193, "rewards/accuracies": 0.625, "rewards/chosen": -0.344645231962204, "rewards/margins": 0.07098521292209625, "rewards/rejected": -0.41563040018081665, "step": 676 }, { "epoch": 0.89, "learning_rate": 4.192787657173204e-05, "logits/chosen": -1.779762625694275, "logits/rejected": -1.8038190603256226, "logps/chosen": -172.65908813476562, "logps/rejected": -175.31455993652344, "loss": 0.775, "rewards/accuracies": 0.5, "rewards/chosen": -0.49525824189186096, "rewards/margins": -0.10039810091257095, "rewards/rejected": -0.394860178232193, "step": 677 }, { "epoch": 0.89, "learning_rate": 4.1901492563335115e-05, "logits/chosen": -2.0335657596588135, "logits/rejected": -1.9739696979522705, "logps/chosen": -209.03280639648438, "logps/rejected": -197.73880004882812, "loss": 0.9794, "rewards/accuracies": 0.1875, "rewards/chosen": -0.6130102872848511, "rewards/margins": -0.4307286739349365, "rewards/rejected": -0.18228159844875336, "step": 678 }, { "epoch": 0.89, "learning_rate": 4.187507383782303e-05, "logits/chosen": -1.7845653295516968, "logits/rejected": -1.8473167419433594, "logps/chosen": -175.36248779296875, "logps/rejected": -192.36488342285156, "loss": 0.6858, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3342759311199188, "rewards/margins": 0.16146346926689148, "rewards/rejected": -0.4957394003868103, "step": 679 }, { "epoch": 0.89, "learning_rate": 4.1848620449462115e-05, "logits/chosen": -1.681877613067627, "logits/rejected": -1.7026312351226807, "logps/chosen": -193.92356872558594, "logps/rejected": -171.92117309570312, "loss": 0.9088, "rewards/accuracies": 0.25, "rewards/chosen": -0.47164231538772583, "rewards/margins": -0.34437641501426697, "rewards/rejected": -0.12726587057113647, "step": 680 }, { "epoch": 0.89, "learning_rate": 4.1822132452589885e-05, "logits/chosen": -2.0411343574523926, "logits/rejected": -2.014064311981201, "logps/chosen": -177.00779724121094, "logps/rejected": -179.8595428466797, "loss": 0.559, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27876853942871094, "rewards/margins": 0.3328251242637634, "rewards/rejected": -0.6115936636924744, "step": 681 }, { "epoch": 0.89, "learning_rate": 4.1795609901614966e-05, "logits/chosen": -1.8736965656280518, "logits/rejected": -1.8809561729431152, "logps/chosen": -176.62069702148438, "logps/rejected": -183.5467529296875, "loss": 0.7303, "rewards/accuracies": 0.5, "rewards/chosen": -0.2544437348842621, "rewards/margins": 0.08209258317947388, "rewards/rejected": -0.33653631806373596, "step": 682 }, { "epoch": 0.89, "learning_rate": 4.176905285101695e-05, "logits/chosen": -1.7462670803070068, "logits/rejected": -1.761965036392212, "logps/chosen": -151.63314819335938, "logps/rejected": -151.56427001953125, "loss": 0.6546, "rewards/accuracies": 0.625, "rewards/chosen": -0.3507009446620941, "rewards/margins": 0.17545562982559204, "rewards/rejected": -0.5261565446853638, "step": 683 }, { "epoch": 0.9, "learning_rate": 4.17424613553463e-05, "logits/chosen": -1.8288514614105225, "logits/rejected": -1.859897255897522, "logps/chosen": -171.83522033691406, "logps/rejected": -191.14358520507812, "loss": 0.7507, "rewards/accuracies": 0.3125, "rewards/chosen": -0.28291165828704834, "rewards/margins": -0.012934118509292603, "rewards/rejected": -0.26997753977775574, "step": 684 }, { "epoch": 0.9, "learning_rate": 4.171583546922423e-05, "logits/chosen": -1.8343758583068848, "logits/rejected": -1.8267977237701416, "logps/chosen": -135.20872497558594, "logps/rejected": -134.46913146972656, "loss": 0.7083, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1251545250415802, "rewards/margins": 0.06863637268543243, "rewards/rejected": -0.19379091262817383, "step": 685 }, { "epoch": 0.9, "learning_rate": 4.1689175247342584e-05, "logits/chosen": -1.9471396207809448, "logits/rejected": -1.9692058563232422, "logps/chosen": -192.59835815429688, "logps/rejected": -198.68960571289062, "loss": 0.6293, "rewards/accuracies": 0.5, "rewards/chosen": -0.2941395044326782, "rewards/margins": 0.24021106958389282, "rewards/rejected": -0.5343505144119263, "step": 686 }, { "epoch": 0.9, "learning_rate": 4.1662480744463744e-05, "logits/chosen": -1.9480713605880737, "logits/rejected": -1.9755477905273438, "logps/chosen": -179.84698486328125, "logps/rejected": -167.24124145507812, "loss": 0.7688, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4756694436073303, "rewards/margins": 0.07222910225391388, "rewards/rejected": -0.547898530960083, "step": 687 }, { "epoch": 0.9, "learning_rate": 4.163575201542052e-05, "logits/chosen": -1.9736183881759644, "logits/rejected": -1.9988312721252441, "logps/chosen": -155.44598388671875, "logps/rejected": -169.1442108154297, "loss": 0.6985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0029303748160600662, "rewards/margins": 0.2341531217098236, "rewards/rejected": -0.23708350956439972, "step": 688 }, { "epoch": 0.9, "learning_rate": 4.1608989115116e-05, "logits/chosen": -1.8959014415740967, "logits/rejected": -1.9003026485443115, "logps/chosen": -191.73049926757812, "logps/rejected": -210.97207641601562, "loss": 0.8625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14525511860847473, "rewards/margins": -0.15813013911247253, "rewards/rejected": 0.012875035405158997, "step": 689 }, { "epoch": 0.9, "learning_rate": 4.158219209852349e-05, "logits/chosen": -1.875536561012268, "logits/rejected": -1.8144606351852417, "logps/chosen": -183.92649841308594, "logps/rejected": -184.21307373046875, "loss": 0.8123, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3249794840812683, "rewards/margins": -0.07503964751958847, "rewards/rejected": -0.24993987381458282, "step": 690 }, { "epoch": 0.9, "learning_rate": 4.155536102068636e-05, "logits/chosen": -1.874202013015747, "logits/rejected": -1.906548261642456, "logps/chosen": -194.843505859375, "logps/rejected": -206.90777587890625, "loss": 0.8466, "rewards/accuracies": 0.375, "rewards/chosen": -0.4885198473930359, "rewards/margins": -0.20483222603797913, "rewards/rejected": -0.28368765115737915, "step": 691 }, { "epoch": 0.91, "learning_rate": 4.152849593671793e-05, "logits/chosen": -1.8392844200134277, "logits/rejected": -1.8655755519866943, "logps/chosen": -232.13516235351562, "logps/rejected": -248.52139282226562, "loss": 0.8003, "rewards/accuracies": 0.4375, "rewards/chosen": -0.466138631105423, "rewards/margins": -0.003687852993607521, "rewards/rejected": -0.46245077252388, "step": 692 }, { "epoch": 0.91, "learning_rate": 4.1501596901801384e-05, "logits/chosen": -1.7795881032943726, "logits/rejected": -1.749732255935669, "logps/chosen": -207.29209899902344, "logps/rejected": -188.52491760253906, "loss": 0.7602, "rewards/accuracies": 0.5, "rewards/chosen": -0.2848435938358307, "rewards/margins": 0.07350137829780579, "rewards/rejected": -0.3583449721336365, "step": 693 }, { "epoch": 0.91, "learning_rate": 4.147466397118968e-05, "logits/chosen": -1.8992621898651123, "logits/rejected": -1.870530128479004, "logps/chosen": -291.69342041015625, "logps/rejected": -304.0249938964844, "loss": 0.7225, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4635903239250183, "rewards/margins": 0.08878035098314285, "rewards/rejected": -0.5523706674575806, "step": 694 }, { "epoch": 0.91, "learning_rate": 4.144769720020533e-05, "logits/chosen": -1.7536263465881348, "logits/rejected": -1.779775619506836, "logps/chosen": -216.75164794921875, "logps/rejected": -257.4428405761719, "loss": 0.8723, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7266858220100403, "rewards/margins": -0.09885367751121521, "rewards/rejected": -0.6278320550918579, "step": 695 }, { "epoch": 0.91, "learning_rate": 4.142069664424041e-05, "logits/chosen": -1.9797451496124268, "logits/rejected": -1.9267971515655518, "logps/chosen": -165.46441650390625, "logps/rejected": -167.7301483154297, "loss": 0.7904, "rewards/accuracies": 0.5, "rewards/chosen": -0.5045456290245056, "rewards/margins": -0.02093798667192459, "rewards/rejected": -0.4836076498031616, "step": 696 }, { "epoch": 0.91, "learning_rate": 4.139366235875637e-05, "logits/chosen": -1.8825287818908691, "logits/rejected": -1.8622865676879883, "logps/chosen": -185.31285095214844, "logps/rejected": -207.89364624023438, "loss": 0.8387, "rewards/accuracies": 0.5, "rewards/chosen": -0.23477278649806976, "rewards/margins": -0.08795370161533356, "rewards/rejected": -0.146819069981575, "step": 697 }, { "epoch": 0.91, "learning_rate": 4.136659439928397e-05, "logits/chosen": -1.8915377855300903, "logits/rejected": -1.8720455169677734, "logps/chosen": -194.33871459960938, "logps/rejected": -203.15911865234375, "loss": 0.5884, "rewards/accuracies": 0.625, "rewards/chosen": 0.17537376284599304, "rewards/margins": 0.34904515743255615, "rewards/rejected": -0.17367137968540192, "step": 698 }, { "epoch": 0.91, "learning_rate": 4.13394928214231e-05, "logits/chosen": -1.3153133392333984, "logits/rejected": -1.2558776140213013, "logps/chosen": -226.58245849609375, "logps/rejected": -229.76478576660156, "loss": 0.7587, "rewards/accuracies": 0.4375, "rewards/chosen": -0.569631040096283, "rewards/margins": -0.02661065012216568, "rewards/rejected": -0.5430203676223755, "step": 699 }, { "epoch": 0.92, "learning_rate": 4.1312357680842735e-05, "logits/chosen": -1.6899161338806152, "logits/rejected": -1.8224129676818848, "logps/chosen": -157.56524658203125, "logps/rejected": -175.26402282714844, "loss": 0.8479, "rewards/accuracies": 0.375, "rewards/chosen": -0.6318594813346863, "rewards/margins": -0.13617253303527832, "rewards/rejected": -0.49568694829940796, "step": 700 }, { "epoch": 0.92, "learning_rate": 4.128518903328078e-05, "logits/chosen": -1.964836597442627, "logits/rejected": -1.9238197803497314, "logps/chosen": -153.36062622070312, "logps/rejected": -157.65313720703125, "loss": 0.7669, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5114039778709412, "rewards/margins": 0.02798408642411232, "rewards/rejected": -0.5393880605697632, "step": 701 }, { "epoch": 0.92, "learning_rate": 4.125798693454396e-05, "logits/chosen": -2.0570058822631836, "logits/rejected": -2.0080535411834717, "logps/chosen": -194.94296264648438, "logps/rejected": -177.8400421142578, "loss": 0.752, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22861789166927338, "rewards/margins": 0.06845703721046448, "rewards/rejected": -0.29707497358322144, "step": 702 }, { "epoch": 0.92, "learning_rate": 4.123075144050772e-05, "logits/chosen": -1.79075026512146, "logits/rejected": -1.7762547731399536, "logps/chosen": -182.18724060058594, "logps/rejected": -171.6795654296875, "loss": 0.8897, "rewards/accuracies": 0.375, "rewards/chosen": -0.5534724593162537, "rewards/margins": -0.30023112893104553, "rewards/rejected": -0.25324133038520813, "step": 703 }, { "epoch": 0.92, "learning_rate": 4.120348260711611e-05, "logits/chosen": -1.4944194555282593, "logits/rejected": -1.5302600860595703, "logps/chosen": -200.65188598632812, "logps/rejected": -205.47659301757812, "loss": 0.6494, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6325050592422485, "rewards/margins": 0.22558629512786865, "rewards/rejected": -0.8580912351608276, "step": 704 }, { "epoch": 0.92, "learning_rate": 4.117618049038165e-05, "logits/chosen": -1.6539419889450073, "logits/rejected": -1.763761043548584, "logps/chosen": -204.7454376220703, "logps/rejected": -203.48162841796875, "loss": 0.8093, "rewards/accuracies": 0.5, "rewards/chosen": -0.12654227018356323, "rewards/margins": -0.014543063938617706, "rewards/rejected": -0.11199923604726791, "step": 705 }, { "epoch": 0.92, "learning_rate": 4.1148845146385214e-05, "logits/chosen": -1.9443776607513428, "logits/rejected": -1.9469598531723022, "logps/chosen": -160.744140625, "logps/rejected": -189.53492736816406, "loss": 0.9369, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3124936819076538, "rewards/margins": -0.24969440698623657, "rewards/rejected": -0.06279925256967545, "step": 706 }, { "epoch": 0.93, "learning_rate": 4.112147663127596e-05, "logits/chosen": -1.9382710456848145, "logits/rejected": -1.9214251041412354, "logps/chosen": -222.22467041015625, "logps/rejected": -222.050537109375, "loss": 0.8019, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11056547611951828, "rewards/margins": -0.16465231776237488, "rewards/rejected": 0.054086834192276, "step": 707 }, { "epoch": 0.93, "learning_rate": 4.109407500127116e-05, "logits/chosen": -1.9728318452835083, "logits/rejected": -2.0126798152923584, "logps/chosen": -215.3148651123047, "logps/rejected": -226.9068603515625, "loss": 0.6567, "rewards/accuracies": 0.4375, "rewards/chosen": 0.04354026913642883, "rewards/margins": 0.456782728433609, "rewards/rejected": -0.4132424294948578, "step": 708 }, { "epoch": 0.93, "learning_rate": 4.106664031265611e-05, "logits/chosen": -2.0477206707000732, "logits/rejected": -2.063199043273926, "logps/chosen": -175.30618286132812, "logps/rejected": -174.9276885986328, "loss": 0.6284, "rewards/accuracies": 0.625, "rewards/chosen": -0.07596893608570099, "rewards/margins": 0.2849644720554352, "rewards/rejected": -0.36093342304229736, "step": 709 }, { "epoch": 0.93, "learning_rate": 4.103917262178402e-05, "logits/chosen": -1.9526329040527344, "logits/rejected": -2.0106678009033203, "logps/chosen": -168.55447387695312, "logps/rejected": -186.49818420410156, "loss": 0.6474, "rewards/accuracies": 0.625, "rewards/chosen": -0.09987005591392517, "rewards/margins": 0.15396371483802795, "rewards/rejected": -0.2538337707519531, "step": 710 }, { "epoch": 0.93, "learning_rate": 4.1011671985075865e-05, "logits/chosen": -1.7663332223892212, "logits/rejected": -1.7783488035202026, "logps/chosen": -147.06881713867188, "logps/rejected": -149.45726013183594, "loss": 0.6022, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16052348911762238, "rewards/margins": 0.24407917261123657, "rewards/rejected": -0.08355572074651718, "step": 711 }, { "epoch": 0.93, "learning_rate": 4.098413845902033e-05, "logits/chosen": -1.698045253753662, "logits/rejected": -1.800790786743164, "logps/chosen": -167.90267944335938, "logps/rejected": -177.54620361328125, "loss": 0.8036, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11254360526800156, "rewards/margins": -0.024379856884479523, "rewards/rejected": -0.08816378563642502, "step": 712 }, { "epoch": 0.93, "learning_rate": 4.095657210017364e-05, "logits/chosen": -1.8730559349060059, "logits/rejected": -1.8630263805389404, "logps/chosen": -178.40362548828125, "logps/rejected": -168.40597534179688, "loss": 0.7513, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1604912132024765, "rewards/margins": -0.03123726323246956, "rewards/rejected": 0.19172844290733337, "step": 713 }, { "epoch": 0.93, "learning_rate": 4.092897296515944e-05, "logits/chosen": -1.9024522304534912, "logits/rejected": -1.9056739807128906, "logps/chosen": -167.58963012695312, "logps/rejected": -177.44297790527344, "loss": 0.9506, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1954386681318283, "rewards/margins": -0.3931421637535095, "rewards/rejected": 0.1977035105228424, "step": 714 }, { "epoch": 0.94, "learning_rate": 4.090134111066874e-05, "logits/chosen": -1.9650360345840454, "logits/rejected": -2.005605459213257, "logps/chosen": -168.1615447998047, "logps/rejected": -198.7694549560547, "loss": 0.7222, "rewards/accuracies": 0.625, "rewards/chosen": 0.17665693163871765, "rewards/margins": 0.07247452437877655, "rewards/rejected": 0.1041824072599411, "step": 715 }, { "epoch": 0.94, "learning_rate": 4.0873676593459725e-05, "logits/chosen": -1.9938017129898071, "logits/rejected": -1.9984208345413208, "logps/chosen": -162.42727661132812, "logps/rejected": -154.55877685546875, "loss": 0.9037, "rewards/accuracies": 0.25, "rewards/chosen": -0.017479307949543, "rewards/margins": -0.1982007771730423, "rewards/rejected": 0.18072140216827393, "step": 716 }, { "epoch": 0.94, "learning_rate": 4.08459794703577e-05, "logits/chosen": -1.8340137004852295, "logits/rejected": -1.7937979698181152, "logps/chosen": -163.39852905273438, "logps/rejected": -163.37271118164062, "loss": 0.8219, "rewards/accuracies": 0.375, "rewards/chosen": 0.0151270292699337, "rewards/margins": -0.09924013912677765, "rewards/rejected": 0.11436714231967926, "step": 717 }, { "epoch": 0.94, "learning_rate": 4.081824979825492e-05, "logits/chosen": -1.834256649017334, "logits/rejected": -1.7396763563156128, "logps/chosen": -178.6987762451172, "logps/rejected": -186.36557006835938, "loss": 0.7235, "rewards/accuracies": 0.375, "rewards/chosen": -0.19348809123039246, "rewards/margins": 0.030947115272283554, "rewards/rejected": -0.2244352102279663, "step": 718 }, { "epoch": 0.94, "learning_rate": 4.07904876341105e-05, "logits/chosen": -1.9482046365737915, "logits/rejected": -1.9566956758499146, "logps/chosen": -165.24465942382812, "logps/rejected": -194.7000274658203, "loss": 0.6016, "rewards/accuracies": 0.625, "rewards/chosen": 0.015304666012525558, "rewards/margins": 0.2807159423828125, "rewards/rejected": -0.26541128754615784, "step": 719 }, { "epoch": 0.94, "learning_rate": 4.076269303495033e-05, "logits/chosen": -1.6069618463516235, "logits/rejected": -1.5738195180892944, "logps/chosen": -218.8490447998047, "logps/rejected": -214.1678466796875, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12460929155349731, "rewards/margins": 0.16634413599967957, "rewards/rejected": -0.04173481464385986, "step": 720 }, { "epoch": 0.94, "learning_rate": 4.073486605786689e-05, "logits/chosen": -1.8867998123168945, "logits/rejected": -1.9408642053604126, "logps/chosen": -162.95245361328125, "logps/rejected": -180.93423461914062, "loss": 0.7724, "rewards/accuracies": 0.5625, "rewards/chosen": -0.036885879933834076, "rewards/margins": -0.006463900208473206, "rewards/rejected": -0.030421972274780273, "step": 721 }, { "epoch": 0.94, "learning_rate": 4.0707006760019175e-05, "logits/chosen": -1.9228891134262085, "logits/rejected": -1.9769740104675293, "logps/chosen": -168.56643676757812, "logps/rejected": -176.51063537597656, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": 0.20986348390579224, "rewards/margins": 0.29382628202438354, "rewards/rejected": -0.08396277576684952, "step": 722 }, { "epoch": 0.95, "learning_rate": 4.067911519863257e-05, "logits/chosen": -1.6868350505828857, "logits/rejected": -1.666567087173462, "logps/chosen": -173.6453399658203, "logps/rejected": -166.50987243652344, "loss": 0.7459, "rewards/accuracies": 0.375, "rewards/chosen": -0.21088244020938873, "rewards/margins": 0.03544869273900986, "rewards/rejected": -0.246331125497818, "step": 723 }, { "epoch": 0.95, "learning_rate": 4.065119143099874e-05, "logits/chosen": -1.78118896484375, "logits/rejected": -1.769471526145935, "logps/chosen": -200.60067749023438, "logps/rejected": -214.09454345703125, "loss": 0.7429, "rewards/accuracies": 0.5, "rewards/chosen": -0.5265517830848694, "rewards/margins": 0.10783174633979797, "rewards/rejected": -0.6343836188316345, "step": 724 }, { "epoch": 0.95, "learning_rate": 4.062323551447549e-05, "logits/chosen": -1.8144245147705078, "logits/rejected": -1.7792181968688965, "logps/chosen": -141.9656524658203, "logps/rejected": -135.1618194580078, "loss": 0.748, "rewards/accuracies": 0.375, "rewards/chosen": -0.021636370569467545, "rewards/margins": -0.00877220556139946, "rewards/rejected": -0.012864157557487488, "step": 725 }, { "epoch": 0.95, "learning_rate": 4.059524750648668e-05, "logits/chosen": -1.7768784761428833, "logits/rejected": -1.7305599451065063, "logps/chosen": -186.06993103027344, "logps/rejected": -194.9784698486328, "loss": 0.7282, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07539110630750656, "rewards/margins": 0.016810424625873566, "rewards/rejected": -0.09220151603221893, "step": 726 }, { "epoch": 0.95, "learning_rate": 4.056722746452207e-05, "logits/chosen": -1.8889234066009521, "logits/rejected": -1.9104032516479492, "logps/chosen": -265.5204772949219, "logps/rejected": -279.9514465332031, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": -0.13372832536697388, "rewards/margins": 0.1461489200592041, "rewards/rejected": -0.279877245426178, "step": 727 }, { "epoch": 0.95, "learning_rate": 4.053917544613723e-05, "logits/chosen": -1.8855706453323364, "logits/rejected": -1.9057148694992065, "logps/chosen": -186.10189819335938, "logps/rejected": -186.93589782714844, "loss": 0.9102, "rewards/accuracies": 0.3125, "rewards/chosen": -0.23901258409023285, "rewards/margins": -0.2926080822944641, "rewards/rejected": 0.05359550192952156, "step": 728 }, { "epoch": 0.95, "learning_rate": 4.051109150895343e-05, "logits/chosen": -1.8091373443603516, "logits/rejected": -1.8701030015945435, "logps/chosen": -217.9191131591797, "logps/rejected": -178.51612854003906, "loss": 0.8551, "rewards/accuracies": 0.5, "rewards/chosen": 0.050105515867471695, "rewards/margins": -0.13900446891784668, "rewards/rejected": 0.18911001086235046, "step": 729 }, { "epoch": 0.96, "learning_rate": 4.0482975710657455e-05, "logits/chosen": -1.8592071533203125, "logits/rejected": -1.8811057806015015, "logps/chosen": -168.57730102539062, "logps/rejected": -204.54586791992188, "loss": 0.7531, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08510392904281616, "rewards/margins": 0.1694384664297104, "rewards/rejected": -0.25454244017601013, "step": 730 }, { "epoch": 0.96, "learning_rate": 4.045482810900159e-05, "logits/chosen": -2.088895797729492, "logits/rejected": -2.0842466354370117, "logps/chosen": -177.7628631591797, "logps/rejected": -171.59335327148438, "loss": 0.5767, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16781626641750336, "rewards/margins": 0.34129244089126587, "rewards/rejected": -0.17347615957260132, "step": 731 }, { "epoch": 0.96, "learning_rate": 4.042664876180341e-05, "logits/chosen": -2.0007247924804688, "logits/rejected": -1.9096792936325073, "logps/chosen": -180.6066131591797, "logps/rejected": -170.75169372558594, "loss": 0.6665, "rewards/accuracies": 0.375, "rewards/chosen": -0.059895459562540054, "rewards/margins": 0.11071177572011948, "rewards/rejected": -0.17060723900794983, "step": 732 }, { "epoch": 0.96, "learning_rate": 4.0398437726945716e-05, "logits/chosen": -1.975681185722351, "logits/rejected": -1.9806488752365112, "logps/chosen": -190.25149536132812, "logps/rejected": -190.06532287597656, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14021314680576324, "rewards/margins": 0.528197169303894, "rewards/rejected": -0.3879840672016144, "step": 733 }, { "epoch": 0.96, "learning_rate": 4.037019506237638e-05, "logits/chosen": -1.9460779428482056, "logits/rejected": -1.9403655529022217, "logps/chosen": -203.6526336669922, "logps/rejected": -222.22845458984375, "loss": 0.7525, "rewards/accuracies": 0.5, "rewards/chosen": -0.027491139248013496, "rewards/margins": -0.014092395082116127, "rewards/rejected": -0.013398736715316772, "step": 734 }, { "epoch": 0.96, "learning_rate": 4.034192082610828e-05, "logits/chosen": -1.9467103481292725, "logits/rejected": -1.9498211145401, "logps/chosen": -178.94808959960938, "logps/rejected": -193.01995849609375, "loss": 0.6873, "rewards/accuracies": 0.75, "rewards/chosen": 0.14050908386707306, "rewards/margins": 0.16484788060188293, "rewards/rejected": -0.024338817223906517, "step": 735 }, { "epoch": 0.96, "learning_rate": 4.031361507621911e-05, "logits/chosen": -1.9716556072235107, "logits/rejected": -1.9920234680175781, "logps/chosen": -169.69122314453125, "logps/rejected": -174.0656280517578, "loss": 0.601, "rewards/accuracies": 0.5625, "rewards/chosen": 0.26089152693748474, "rewards/margins": 0.31827694177627563, "rewards/rejected": -0.057385385036468506, "step": 736 }, { "epoch": 0.96, "learning_rate": 4.02852778708513e-05, "logits/chosen": -1.8285408020019531, "logits/rejected": -1.825709581375122, "logps/chosen": -201.5704345703125, "logps/rejected": -208.6463165283203, "loss": 0.7082, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05932292714715004, "rewards/margins": 0.07191256433725357, "rewards/rejected": -0.1312354952096939, "step": 737 }, { "epoch": 0.97, "learning_rate": 4.0256909268211914e-05, "logits/chosen": -1.866570234298706, "logits/rejected": -1.9105130434036255, "logps/chosen": -155.8884735107422, "logps/rejected": -167.31063842773438, "loss": 0.7262, "rewards/accuracies": 0.4375, "rewards/chosen": 0.06431854516267776, "rewards/margins": 0.020435571670532227, "rewards/rejected": 0.04388298839330673, "step": 738 }, { "epoch": 0.97, "learning_rate": 4.0228509326572496e-05, "logits/chosen": -1.8914620876312256, "logits/rejected": -1.835383415222168, "logps/chosen": -193.6123809814453, "logps/rejected": -189.48651123046875, "loss": 1.0269, "rewards/accuracies": 0.25, "rewards/chosen": -0.6893608570098877, "rewards/margins": -0.49458661675453186, "rewards/rejected": -0.19477425515651703, "step": 739 }, { "epoch": 0.97, "learning_rate": 4.0200078104268944e-05, "logits/chosen": -2.1169674396514893, "logits/rejected": -2.1131343841552734, "logps/chosen": -197.66700744628906, "logps/rejected": -203.12513732910156, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": 0.24454453587532043, "rewards/margins": 0.23662899434566498, "rewards/rejected": 0.007915541529655457, "step": 740 }, { "epoch": 0.97, "learning_rate": 4.017161565970144e-05, "logits/chosen": -1.946454405784607, "logits/rejected": -1.9714114665985107, "logps/chosen": -195.95687866210938, "logps/rejected": -196.1630401611328, "loss": 0.8647, "rewards/accuracies": 0.4375, "rewards/chosen": -0.28991246223449707, "rewards/margins": -0.02519702911376953, "rewards/rejected": -0.26471543312072754, "step": 741 }, { "epoch": 0.97, "learning_rate": 4.014312205133428e-05, "logits/chosen": -1.8742949962615967, "logits/rejected": -1.8860889673233032, "logps/chosen": -170.3537139892578, "logps/rejected": -176.24937438964844, "loss": 0.5796, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2740973234176636, "rewards/margins": 0.3400841951370239, "rewards/rejected": -0.06598688662052155, "step": 742 }, { "epoch": 0.97, "learning_rate": 4.011459733769579e-05, "logits/chosen": -1.8638414144515991, "logits/rejected": -1.8556957244873047, "logps/chosen": -219.03761291503906, "logps/rejected": -227.22244262695312, "loss": 0.8823, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4476678967475891, "rewards/margins": -0.2778720259666443, "rewards/rejected": -0.16979584097862244, "step": 743 }, { "epoch": 0.97, "learning_rate": 4.0086041577378166e-05, "logits/chosen": -1.8305257558822632, "logits/rejected": -1.7608011960983276, "logps/chosen": -187.88043212890625, "logps/rejected": -169.92657470703125, "loss": 0.7201, "rewards/accuracies": 0.6875, "rewards/chosen": -0.090822733938694, "rewards/margins": 0.07794052362442017, "rewards/rejected": -0.16876326501369476, "step": 744 }, { "epoch": 0.97, "learning_rate": 4.005745482903739e-05, "logits/chosen": -1.8173670768737793, "logits/rejected": -1.7855581045150757, "logps/chosen": -179.5590362548828, "logps/rejected": -166.4422607421875, "loss": 0.6341, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17833015322685242, "rewards/margins": 0.2266974300146103, "rewards/rejected": -0.04836726933717728, "step": 745 }, { "epoch": 0.98, "learning_rate": 4.002883715139309e-05, "logits/chosen": -1.7349971532821655, "logits/rejected": -1.7977635860443115, "logps/chosen": -170.27752685546875, "logps/rejected": -167.193603515625, "loss": 0.7021, "rewards/accuracies": 0.375, "rewards/chosen": -0.12941241264343262, "rewards/margins": 0.05854359269142151, "rewards/rejected": -0.1879560351371765, "step": 746 }, { "epoch": 0.98, "learning_rate": 4.000018860322845e-05, "logits/chosen": -1.7580136060714722, "logits/rejected": -1.7533882856369019, "logps/chosen": -178.30084228515625, "logps/rejected": -163.27293395996094, "loss": 0.6128, "rewards/accuracies": 0.625, "rewards/chosen": 0.03221731632947922, "rewards/margins": 0.30581218004226685, "rewards/rejected": -0.2735949158668518, "step": 747 }, { "epoch": 0.98, "learning_rate": 3.9971509243390025e-05, "logits/chosen": -1.8093492984771729, "logits/rejected": -1.8199714422225952, "logps/chosen": -144.11441040039062, "logps/rejected": -162.16128540039062, "loss": 0.723, "rewards/accuracies": 0.375, "rewards/chosen": 0.020870715379714966, "rewards/margins": 0.024553870782256126, "rewards/rejected": -0.0036831647157669067, "step": 748 }, { "epoch": 0.98, "learning_rate": 3.99427991307877e-05, "logits/chosen": -1.867729902267456, "logits/rejected": -1.8255057334899902, "logps/chosen": -183.21380615234375, "logps/rejected": -181.64158630371094, "loss": 0.7334, "rewards/accuracies": 0.5, "rewards/chosen": 0.21690025925636292, "rewards/margins": 0.19758300483226776, "rewards/rejected": 0.01931723952293396, "step": 749 }, { "epoch": 0.98, "learning_rate": 3.9914058324394486e-05, "logits/chosen": -1.9604843854904175, "logits/rejected": -2.0102388858795166, "logps/chosen": -140.79994201660156, "logps/rejected": -147.5875244140625, "loss": 0.7198, "rewards/accuracies": 0.5, "rewards/chosen": 0.15232627093791962, "rewards/margins": 0.18342937529087067, "rewards/rejected": -0.03110312670469284, "step": 750 }, { "epoch": 0.98, "learning_rate": 3.9885286883246476e-05, "logits/chosen": -1.7222356796264648, "logits/rejected": -1.70112144947052, "logps/chosen": -147.16017150878906, "logps/rejected": -154.9020233154297, "loss": 0.92, "rewards/accuracies": 0.375, "rewards/chosen": 0.529461145401001, "rewards/margins": -0.19158104062080383, "rewards/rejected": 0.7210422158241272, "step": 751 }, { "epoch": 0.98, "learning_rate": 3.985648486644267e-05, "logits/chosen": -1.743016004562378, "logits/rejected": -1.6408265829086304, "logps/chosen": -236.60455322265625, "logps/rejected": -226.41018676757812, "loss": 0.7036, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3873446583747864, "rewards/margins": 0.05065695941448212, "rewards/rejected": -0.4380015730857849, "step": 752 }, { "epoch": 0.99, "learning_rate": 3.982765233314489e-05, "logits/chosen": -1.9756314754486084, "logits/rejected": -1.9660886526107788, "logps/chosen": -179.02066040039062, "logps/rejected": -178.9298858642578, "loss": 0.832, "rewards/accuracies": 0.3125, "rewards/chosen": 0.08823183178901672, "rewards/margins": -0.16290561854839325, "rewards/rejected": 0.25113746523857117, "step": 753 }, { "epoch": 0.99, "learning_rate": 3.979878934257762e-05, "logits/chosen": -1.6876684427261353, "logits/rejected": -1.738308310508728, "logps/chosen": -284.55389404296875, "logps/rejected": -267.27178955078125, "loss": 0.8761, "rewards/accuracies": 0.5, "rewards/chosen": 0.01247774064540863, "rewards/margins": -0.1615845113992691, "rewards/rejected": 0.17406225204467773, "step": 754 }, { "epoch": 0.99, "learning_rate": 3.976989595402793e-05, "logits/chosen": -1.7371678352355957, "logits/rejected": -1.767361044883728, "logps/chosen": -166.92442321777344, "logps/rejected": -175.44528198242188, "loss": 0.7914, "rewards/accuracies": 0.375, "rewards/chosen": -0.061636172235012054, "rewards/margins": -0.010894455015659332, "rewards/rejected": -0.050741732120513916, "step": 755 }, { "epoch": 0.99, "learning_rate": 3.974097222684532e-05, "logits/chosen": -1.9828624725341797, "logits/rejected": -1.9450846910476685, "logps/chosen": -182.8007049560547, "logps/rejected": -179.33963012695312, "loss": 0.8109, "rewards/accuracies": 0.4375, "rewards/chosen": 0.20281130075454712, "rewards/margins": -0.025454670190811157, "rewards/rejected": 0.22826597094535828, "step": 756 }, { "epoch": 0.99, "learning_rate": 3.9712018220441596e-05, "logits/chosen": -2.006014585494995, "logits/rejected": -2.0298256874084473, "logps/chosen": -163.69482421875, "logps/rejected": -163.8912353515625, "loss": 0.7536, "rewards/accuracies": 0.3125, "rewards/chosen": -0.13053151965141296, "rewards/margins": -0.029842479154467583, "rewards/rejected": -0.10068905353546143, "step": 757 }, { "epoch": 0.99, "learning_rate": 3.9683033994290767e-05, "logits/chosen": -1.5359764099121094, "logits/rejected": -1.5906102657318115, "logps/chosen": -176.468505859375, "logps/rejected": -200.26760864257812, "loss": 0.7742, "rewards/accuracies": 0.5625, "rewards/chosen": -0.018672414124011993, "rewards/margins": 0.06273224204778671, "rewards/rejected": -0.08140464872121811, "step": 758 }, { "epoch": 0.99, "learning_rate": 3.965401960792894e-05, "logits/chosen": -1.7277990579605103, "logits/rejected": -1.7565523386001587, "logps/chosen": -180.8856658935547, "logps/rejected": -178.6083526611328, "loss": 0.742, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007134014740586281, "rewards/margins": 0.06274452805519104, "rewards/rejected": -0.06987853348255157, "step": 759 }, { "epoch": 0.99, "learning_rate": 3.962497512095412e-05, "logits/chosen": -1.7388745546340942, "logits/rejected": -1.780763864517212, "logps/chosen": -193.02093505859375, "logps/rejected": -216.36962890625, "loss": 0.8278, "rewards/accuracies": 0.375, "rewards/chosen": 0.12493382394313812, "rewards/margins": -0.1711753010749817, "rewards/rejected": 0.296109139919281, "step": 760 }, { "epoch": 1.0, "learning_rate": 3.95959005930262e-05, "logits/chosen": -1.7948150634765625, "logits/rejected": -1.85101318359375, "logps/chosen": -229.8053741455078, "logps/rejected": -240.33815002441406, "loss": 0.6705, "rewards/accuracies": 0.625, "rewards/chosen": 0.19152618944644928, "rewards/margins": 0.1523137390613556, "rewards/rejected": 0.03921244665980339, "step": 761 }, { "epoch": 1.0, "learning_rate": 3.9566796083866756e-05, "logits/chosen": -1.626505732536316, "logits/rejected": -1.6348035335540771, "logps/chosen": -146.9053955078125, "logps/rejected": -159.9919891357422, "loss": 0.784, "rewards/accuracies": 0.5, "rewards/chosen": 0.29515981674194336, "rewards/margins": 0.03867659345269203, "rewards/rejected": 0.2564832270145416, "step": 762 }, { "epoch": 1.0, "learning_rate": 3.953766165325892e-05, "logits/chosen": -1.582480549812317, "logits/rejected": -1.5091769695281982, "logps/chosen": -175.38235473632812, "logps/rejected": -168.53660583496094, "loss": 0.972, "rewards/accuracies": 0.375, "rewards/chosen": -0.4482717514038086, "rewards/margins": -0.27672773599624634, "rewards/rejected": -0.17154404520988464, "step": 763 }, { "epoch": 1.0, "learning_rate": 3.9508497361047334e-05, "logits/chosen": -1.8544284105300903, "logits/rejected": -1.8667871952056885, "logps/chosen": -173.8372039794922, "logps/rejected": -182.494873046875, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.09536217153072357, "rewards/margins": 0.11863154172897339, "rewards/rejected": -0.023269355297088623, "step": 764 }, { "epoch": 1.0, "learning_rate": 3.9479303267137944e-05, "logits/chosen": -1.843505859375, "logits/rejected": -1.9012435674667358, "logps/chosen": -136.9263458251953, "logps/rejected": -189.14739990234375, "loss": 0.167, "rewards/accuracies": 0.9375, "rewards/chosen": 1.681319236755371, "rewards/margins": 4.289565086364746, "rewards/rejected": -2.608245849609375, "step": 765 }, { "epoch": 1.0, "learning_rate": 3.9450079431497936e-05, "logits/chosen": -1.6857999563217163, "logits/rejected": -1.6979467868804932, "logps/chosen": -242.67828369140625, "logps/rejected": -301.3565673828125, "loss": 0.0933, "rewards/accuracies": 0.9375, "rewards/chosen": 2.9228906631469727, "rewards/margins": 6.939729690551758, "rewards/rejected": -4.016839027404785, "step": 766 }, { "epoch": 1.0, "learning_rate": 3.9420825914155554e-05, "logits/chosen": -1.873047113418579, "logits/rejected": -1.9470162391662598, "logps/chosen": -154.30426025390625, "logps/rejected": -221.78143310546875, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 3.0256850719451904, "rewards/margins": 6.026838779449463, "rewards/rejected": -3.0011534690856934, "step": 767 }, { "epoch": 1.01, "learning_rate": 3.939154277520006e-05, "logits/chosen": -1.6709171533584595, "logits/rejected": -1.6833773851394653, "logps/chosen": -145.9373321533203, "logps/rejected": -248.9676055908203, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 3.266024351119995, "rewards/margins": 8.912620544433594, "rewards/rejected": -5.6465959548950195, "step": 768 }, { "epoch": 1.01, "learning_rate": 3.9362230074781506e-05, "logits/chosen": -1.9885060787200928, "logits/rejected": -1.9463064670562744, "logps/chosen": -152.9366912841797, "logps/rejected": -194.1561279296875, "loss": 0.062, "rewards/accuracies": 0.9375, "rewards/chosen": 2.965550422668457, "rewards/margins": 6.187473773956299, "rewards/rejected": -3.2219231128692627, "step": 769 }, { "epoch": 1.01, "learning_rate": 3.9332887873110695e-05, "logits/chosen": -1.4813066720962524, "logits/rejected": -1.4207929372787476, "logps/chosen": -132.97509765625, "logps/rejected": -212.00186157226562, "loss": 0.0931, "rewards/accuracies": 0.875, "rewards/chosen": 2.51338791847229, "rewards/margins": 6.154384613037109, "rewards/rejected": -3.6409966945648193, "step": 770 }, { "epoch": 1.01, "learning_rate": 3.9303516230459035e-05, "logits/chosen": -1.8454346656799316, "logits/rejected": -1.9101213216781616, "logps/chosen": -173.029296875, "logps/rejected": -259.5926513671875, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 3.007847785949707, "rewards/margins": 7.848160266876221, "rewards/rejected": -4.840312480926514, "step": 771 }, { "epoch": 1.01, "learning_rate": 3.92741152071584e-05, "logits/chosen": -1.8387994766235352, "logits/rejected": -1.8383971452713013, "logps/chosen": -146.1198272705078, "logps/rejected": -220.85960388183594, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 2.8462674617767334, "rewards/margins": 7.253849983215332, "rewards/rejected": -4.4075822830200195, "step": 772 }, { "epoch": 1.01, "learning_rate": 3.924468486360101e-05, "logits/chosen": -1.62917160987854, "logits/rejected": -1.645263433456421, "logps/chosen": -129.12579345703125, "logps/rejected": -185.53977966308594, "loss": 0.2783, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6398133039474487, "rewards/margins": 5.0858635902404785, "rewards/rejected": -3.4460501670837402, "step": 773 }, { "epoch": 1.01, "learning_rate": 3.921522526023931e-05, "logits/chosen": -1.7776762247085571, "logits/rejected": -1.79482901096344, "logps/chosen": -149.4893798828125, "logps/rejected": -202.3997802734375, "loss": 0.126, "rewards/accuracies": 0.875, "rewards/chosen": 2.4249045848846436, "rewards/margins": 5.789612770080566, "rewards/rejected": -3.364708423614502, "step": 774 }, { "epoch": 1.01, "learning_rate": 3.918573645758586e-05, "logits/chosen": -1.8283805847167969, "logits/rejected": -1.739791750907898, "logps/chosen": -144.54391479492188, "logps/rejected": -221.6058807373047, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 1.915061593055725, "rewards/margins": 5.494343280792236, "rewards/rejected": -3.579281806945801, "step": 775 }, { "epoch": 1.02, "learning_rate": 3.915621851621318e-05, "logits/chosen": -1.9749910831451416, "logits/rejected": -2.0017504692077637, "logps/chosen": -134.46275329589844, "logps/rejected": -218.1602783203125, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 2.595043420791626, "rewards/margins": 7.404112815856934, "rewards/rejected": -4.809070110321045, "step": 776 }, { "epoch": 1.02, "learning_rate": 3.9126671496753666e-05, "logits/chosen": -1.9023005962371826, "logits/rejected": -1.9459162950515747, "logps/chosen": -148.63253784179688, "logps/rejected": -228.6715545654297, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 2.283623218536377, "rewards/margins": 6.3677473068237305, "rewards/rejected": -4.084123611450195, "step": 777 }, { "epoch": 1.02, "learning_rate": 3.909709545989942e-05, "logits/chosen": -1.82866370677948, "logits/rejected": -1.8816454410552979, "logps/chosen": -143.5281982421875, "logps/rejected": -203.86846923828125, "loss": 0.0826, "rewards/accuracies": 0.9375, "rewards/chosen": 2.4189865589141846, "rewards/margins": 5.4766387939453125, "rewards/rejected": -3.057651996612549, "step": 778 }, { "epoch": 1.02, "learning_rate": 3.9067490466402156e-05, "logits/chosen": -1.8777246475219727, "logits/rejected": -1.951180338859558, "logps/chosen": -141.29640197753906, "logps/rejected": -220.64962768554688, "loss": 0.0561, "rewards/accuracies": 0.9375, "rewards/chosen": 3.0638809204101562, "rewards/margins": 7.617124080657959, "rewards/rejected": -4.553243160247803, "step": 779 }, { "epoch": 1.02, "learning_rate": 3.903785657707307e-05, "logits/chosen": -1.7680386304855347, "logits/rejected": -1.7862606048583984, "logps/chosen": -151.75965881347656, "logps/rejected": -215.16712951660156, "loss": 0.1213, "rewards/accuracies": 0.875, "rewards/chosen": 1.1962571144104004, "rewards/margins": 5.733034133911133, "rewards/rejected": -4.536776542663574, "step": 780 }, { "epoch": 1.02, "learning_rate": 3.9008193852782733e-05, "logits/chosen": -1.6814169883728027, "logits/rejected": -1.6905479431152344, "logps/chosen": -151.2788848876953, "logps/rejected": -212.93544006347656, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 1.4553009271621704, "rewards/margins": 5.7619452476501465, "rewards/rejected": -4.306644916534424, "step": 781 }, { "epoch": 1.02, "learning_rate": 3.897850235446089e-05, "logits/chosen": -2.067516565322876, "logits/rejected": -2.084655284881592, "logps/chosen": -184.94361877441406, "logps/rejected": -231.28411865234375, "loss": 0.1911, "rewards/accuracies": 0.75, "rewards/chosen": 2.0862700939178467, "rewards/margins": 5.273751258850098, "rewards/rejected": -3.187481164932251, "step": 782 }, { "epoch": 1.02, "learning_rate": 3.894878214309645e-05, "logits/chosen": -1.7666475772857666, "logits/rejected": -1.8602631092071533, "logps/chosen": -147.2201385498047, "logps/rejected": -206.15673828125, "loss": 0.1023, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5482131242752075, "rewards/margins": 4.794281482696533, "rewards/rejected": -3.246067762374878, "step": 783 }, { "epoch": 1.03, "learning_rate": 3.8919033279737274e-05, "logits/chosen": -1.707251787185669, "logits/rejected": -1.7892178297042847, "logps/chosen": -196.2216339111328, "logps/rejected": -238.6273193359375, "loss": 0.1521, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7280199527740479, "rewards/margins": 3.8712680339813232, "rewards/rejected": -3.1432480812072754, "step": 784 }, { "epoch": 1.03, "learning_rate": 3.888925582549006e-05, "logits/chosen": -1.66917884349823, "logits/rejected": -1.6865543127059937, "logps/chosen": -141.897705078125, "logps/rejected": -234.94375610351562, "loss": 0.0569, "rewards/accuracies": 0.9375, "rewards/chosen": 2.608001708984375, "rewards/margins": 7.768197059631348, "rewards/rejected": -5.160194396972656, "step": 785 }, { "epoch": 1.03, "learning_rate": 3.885944984152027e-05, "logits/chosen": -1.6489955186843872, "logits/rejected": -1.6364490985870361, "logps/chosen": -177.00204467773438, "logps/rejected": -221.19728088378906, "loss": 0.098, "rewards/accuracies": 0.9375, "rewards/chosen": 1.609315276145935, "rewards/margins": 6.382396697998047, "rewards/rejected": -4.7730817794799805, "step": 786 }, { "epoch": 1.03, "learning_rate": 3.882961538905194e-05, "logits/chosen": -1.6952452659606934, "logits/rejected": -1.7020906209945679, "logps/chosen": -175.68885803222656, "logps/rejected": -243.1931915283203, "loss": 0.0879, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6182076930999756, "rewards/margins": 7.586241722106934, "rewards/rejected": -4.968033790588379, "step": 787 }, { "epoch": 1.03, "learning_rate": 3.879975252936761e-05, "logits/chosen": -1.8468316793441772, "logits/rejected": -1.9614177942276, "logps/chosen": -147.63035583496094, "logps/rejected": -252.27645874023438, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 2.5491623878479004, "rewards/margins": 8.667781829833984, "rewards/rejected": -6.118618965148926, "step": 788 }, { "epoch": 1.03, "learning_rate": 3.876986132380814e-05, "logits/chosen": -1.6783697605133057, "logits/rejected": -1.7172523736953735, "logps/chosen": -186.01351928710938, "logps/rejected": -243.29734802246094, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 1.7059178352355957, "rewards/margins": 6.882784843444824, "rewards/rejected": -5.17686653137207, "step": 789 }, { "epoch": 1.03, "learning_rate": 3.8739941833772643e-05, "logits/chosen": -2.0805482864379883, "logits/rejected": -2.104659080505371, "logps/chosen": -164.75535583496094, "logps/rejected": -213.48208618164062, "loss": 0.1263, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5398253202438354, "rewards/margins": 5.201066017150879, "rewards/rejected": -3.6612401008605957, "step": 790 }, { "epoch": 1.04, "learning_rate": 3.870999412071829e-05, "logits/chosen": -1.733805537223816, "logits/rejected": -1.6911817789077759, "logps/chosen": -163.6641082763672, "logps/rejected": -234.33090209960938, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 2.18049693107605, "rewards/margins": 8.514333724975586, "rewards/rejected": -6.333837509155273, "step": 791 }, { "epoch": 1.04, "learning_rate": 3.8680018246160295e-05, "logits/chosen": -1.4504364728927612, "logits/rejected": -1.5445796251296997, "logps/chosen": -152.53546142578125, "logps/rejected": -256.28460693359375, "loss": 0.0555, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3757332563400269, "rewards/margins": 6.887189865112305, "rewards/rejected": -5.51145601272583, "step": 792 }, { "epoch": 1.04, "learning_rate": 3.865001427167164e-05, "logits/chosen": -1.8626388311386108, "logits/rejected": -1.9130809307098389, "logps/chosen": -146.39051818847656, "logps/rejected": -227.55630493164062, "loss": 0.1054, "rewards/accuracies": 0.875, "rewards/chosen": 1.8630967140197754, "rewards/margins": 7.0272722244262695, "rewards/rejected": -5.164175033569336, "step": 793 }, { "epoch": 1.04, "learning_rate": 3.861998225888307e-05, "logits/chosen": -1.7840837240219116, "logits/rejected": -1.786619782447815, "logps/chosen": -172.25775146484375, "logps/rejected": -235.2631378173828, "loss": 0.0602, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5000873804092407, "rewards/margins": 7.424620628356934, "rewards/rejected": -5.924533367156982, "step": 794 }, { "epoch": 1.04, "learning_rate": 3.8589922269482924e-05, "logits/chosen": -1.9556963443756104, "logits/rejected": -1.9700720310211182, "logps/chosen": -145.4001922607422, "logps/rejected": -225.95314025878906, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 1.9621433019638062, "rewards/margins": 6.593455791473389, "rewards/rejected": -4.631312370300293, "step": 795 }, { "epoch": 1.04, "learning_rate": 3.855983436521699e-05, "logits/chosen": -1.7366329431533813, "logits/rejected": -1.733896017074585, "logps/chosen": -178.80482482910156, "logps/rejected": -243.79937744140625, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 1.5504860877990723, "rewards/margins": 7.0473833084106445, "rewards/rejected": -5.496897220611572, "step": 796 }, { "epoch": 1.04, "learning_rate": 3.8529718607888394e-05, "logits/chosen": -1.7065753936767578, "logits/rejected": -1.742546796798706, "logps/chosen": -148.2886199951172, "logps/rejected": -208.7879638671875, "loss": 0.0934, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5418556928634644, "rewards/margins": 6.03062629699707, "rewards/rejected": -4.488770961761475, "step": 797 }, { "epoch": 1.04, "learning_rate": 3.8499575059357506e-05, "logits/chosen": -1.7552120685577393, "logits/rejected": -1.6821340322494507, "logps/chosen": -164.05032348632812, "logps/rejected": -235.27076721191406, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 2.730198621749878, "rewards/margins": 8.732093811035156, "rewards/rejected": -6.001894950866699, "step": 798 }, { "epoch": 1.05, "learning_rate": 3.8469403781541745e-05, "logits/chosen": -1.770749807357788, "logits/rejected": -1.7531272172927856, "logps/chosen": -174.0284881591797, "logps/rejected": -260.97369384765625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 1.5956072807312012, "rewards/margins": 7.5574846267700195, "rewards/rejected": -5.961877822875977, "step": 799 }, { "epoch": 1.05, "learning_rate": 3.843920483641551e-05, "logits/chosen": -2.1300318241119385, "logits/rejected": -2.06135630607605, "logps/chosen": -174.64796447753906, "logps/rejected": -267.5289306640625, "loss": 0.0777, "rewards/accuracies": 0.9375, "rewards/chosen": 1.753110647201538, "rewards/margins": 8.441932678222656, "rewards/rejected": -6.6888227462768555, "step": 800 }, { "epoch": 1.05, "learning_rate": 3.840897828601002e-05, "logits/chosen": -1.5714927911758423, "logits/rejected": -1.5915296077728271, "logps/chosen": -158.0545196533203, "logps/rejected": -225.82086181640625, "loss": 0.0912, "rewards/accuracies": 0.875, "rewards/chosen": 1.809893250465393, "rewards/margins": 7.726199626922607, "rewards/rejected": -5.916306018829346, "step": 801 }, { "epoch": 1.05, "learning_rate": 3.83787241924132e-05, "logits/chosen": -2.058527946472168, "logits/rejected": -2.106814384460449, "logps/chosen": -155.36590576171875, "logps/rejected": -222.94711303710938, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 1.4193453788757324, "rewards/margins": 7.101151943206787, "rewards/rejected": -5.681806564331055, "step": 802 }, { "epoch": 1.05, "learning_rate": 3.8348442617769564e-05, "logits/chosen": -1.699159860610962, "logits/rejected": -1.7645118236541748, "logps/chosen": -156.55690002441406, "logps/rejected": -204.28575134277344, "loss": 0.1464, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8710487484931946, "rewards/margins": 5.2086567878723145, "rewards/rejected": -4.337608337402344, "step": 803 }, { "epoch": 1.05, "learning_rate": 3.831813362428005e-05, "logits/chosen": -1.7965739965438843, "logits/rejected": -1.7984563112258911, "logps/chosen": -142.91116333007812, "logps/rejected": -249.2438201904297, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 0.7751538157463074, "rewards/margins": 7.4659013748168945, "rewards/rejected": -6.690747261047363, "step": 804 }, { "epoch": 1.05, "learning_rate": 3.8287797274201934e-05, "logits/chosen": -1.651024580001831, "logits/rejected": -1.6581289768218994, "logps/chosen": -175.62232971191406, "logps/rejected": -262.756103515625, "loss": 0.0982, "rewards/accuracies": 0.875, "rewards/chosen": 0.8959269523620605, "rewards/margins": 6.970417499542236, "rewards/rejected": -6.074490070343018, "step": 805 }, { "epoch": 1.05, "learning_rate": 3.825743362984868e-05, "logits/chosen": -1.6261019706726074, "logits/rejected": -1.6354936361312866, "logps/chosen": -200.4067840576172, "logps/rejected": -259.9723815917969, "loss": 0.0891, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0647060871124268, "rewards/margins": 6.824378967285156, "rewards/rejected": -5.759673118591309, "step": 806 }, { "epoch": 1.06, "learning_rate": 3.8227042753589824e-05, "logits/chosen": -1.8112773895263672, "logits/rejected": -1.778592824935913, "logps/chosen": -170.0568389892578, "logps/rejected": -230.7504425048828, "loss": 0.1387, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11378135532140732, "rewards/margins": 5.9576334953308105, "rewards/rejected": -6.071414947509766, "step": 807 }, { "epoch": 1.06, "learning_rate": 3.819662470785082e-05, "logits/chosen": -1.573864221572876, "logits/rejected": -1.6022815704345703, "logps/chosen": -153.41400146484375, "logps/rejected": -201.80267333984375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 1.118287205696106, "rewards/margins": 6.153818607330322, "rewards/rejected": -5.035531044006348, "step": 808 }, { "epoch": 1.06, "learning_rate": 3.816617955511296e-05, "logits/chosen": -1.71113920211792, "logits/rejected": -1.685309648513794, "logps/chosen": -161.2007598876953, "logps/rejected": -260.3743896484375, "loss": 0.0872, "rewards/accuracies": 0.9375, "rewards/chosen": 1.445091724395752, "rewards/margins": 8.42944049835205, "rewards/rejected": -6.984348773956299, "step": 809 }, { "epoch": 1.06, "learning_rate": 3.8135707357913176e-05, "logits/chosen": -1.684647560119629, "logits/rejected": -1.7264988422393799, "logps/chosen": -161.3009033203125, "logps/rejected": -287.2543640136719, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 1.751045823097229, "rewards/margins": 10.769046783447266, "rewards/rejected": -9.018001556396484, "step": 810 }, { "epoch": 1.06, "learning_rate": 3.8105208178843984e-05, "logits/chosen": -1.7524151802062988, "logits/rejected": -1.7648451328277588, "logps/chosen": -156.43142700195312, "logps/rejected": -232.1067352294922, "loss": 0.0499, "rewards/accuracies": 0.9375, "rewards/chosen": 1.329323649406433, "rewards/margins": 8.172022819519043, "rewards/rejected": -6.8426995277404785, "step": 811 }, { "epoch": 1.06, "learning_rate": 3.8074682080553335e-05, "logits/chosen": -1.649200677871704, "logits/rejected": -1.6988123655319214, "logps/chosen": -150.70803833007812, "logps/rejected": -230.33238220214844, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 0.6994099020957947, "rewards/margins": 7.033271789550781, "rewards/rejected": -6.3338623046875, "step": 812 }, { "epoch": 1.06, "learning_rate": 3.804412912574442e-05, "logits/chosen": -1.7932782173156738, "logits/rejected": -1.7817943096160889, "logps/chosen": -167.56292724609375, "logps/rejected": -223.35543823242188, "loss": 0.0896, "rewards/accuracies": 0.875, "rewards/chosen": 1.1120498180389404, "rewards/margins": 6.955061435699463, "rewards/rejected": -5.843011856079102, "step": 813 }, { "epoch": 1.07, "learning_rate": 3.801354937717565e-05, "logits/chosen": -1.7469156980514526, "logits/rejected": -1.7623285055160522, "logps/chosen": -246.330322265625, "logps/rejected": -311.6409912109375, "loss": 0.1693, "rewards/accuracies": 0.875, "rewards/chosen": -0.7100722789764404, "rewards/margins": 7.019688606262207, "rewards/rejected": -7.729760646820068, "step": 814 }, { "epoch": 1.07, "learning_rate": 3.798294289766043e-05, "logits/chosen": -1.4315271377563477, "logits/rejected": -1.3901729583740234, "logps/chosen": -174.4903564453125, "logps/rejected": -273.9493713378906, "loss": 0.0574, "rewards/accuracies": 0.9375, "rewards/chosen": 1.452012300491333, "rewards/margins": 8.629990577697754, "rewards/rejected": -7.177978515625, "step": 815 }, { "epoch": 1.07, "learning_rate": 3.795230975006712e-05, "logits/chosen": -1.807803988456726, "logits/rejected": -1.8593116998672485, "logps/chosen": -159.69613647460938, "logps/rejected": -266.27716064453125, "loss": 0.0968, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7878644466400146, "rewards/margins": 7.8176188468933105, "rewards/rejected": -7.029754638671875, "step": 816 }, { "epoch": 1.07, "learning_rate": 3.792164999731881e-05, "logits/chosen": -1.9901769161224365, "logits/rejected": -1.984059453010559, "logps/chosen": -150.95579528808594, "logps/rejected": -220.51315307617188, "loss": 0.114, "rewards/accuracies": 0.875, "rewards/chosen": 0.49705514311790466, "rewards/margins": 6.722858428955078, "rewards/rejected": -6.225803852081299, "step": 817 }, { "epoch": 1.07, "learning_rate": 3.789096370239328e-05, "logits/chosen": -1.8745883703231812, "logits/rejected": -1.872071623802185, "logps/chosen": -186.24365234375, "logps/rejected": -262.4015197753906, "loss": 0.0995, "rewards/accuracies": 0.875, "rewards/chosen": 0.6306481957435608, "rewards/margins": 7.7143330574035645, "rewards/rejected": -7.08368444442749, "step": 818 }, { "epoch": 1.07, "learning_rate": 3.786025092832279e-05, "logits/chosen": -1.7694121599197388, "logits/rejected": -1.7589280605316162, "logps/chosen": -172.3319854736328, "logps/rejected": -252.0114288330078, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 0.7360559105873108, "rewards/margins": 8.137409210205078, "rewards/rejected": -7.401352882385254, "step": 819 }, { "epoch": 1.07, "learning_rate": 3.782951173819403e-05, "logits/chosen": -1.6934165954589844, "logits/rejected": -1.7441664934158325, "logps/chosen": -191.2655487060547, "logps/rejected": -317.37506103515625, "loss": 0.0627, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24915921688079834, "rewards/margins": 6.986499309539795, "rewards/rejected": -6.737339973449707, "step": 820 }, { "epoch": 1.07, "learning_rate": 3.7798746195147914e-05, "logits/chosen": -1.7119529247283936, "logits/rejected": -1.7413674592971802, "logps/chosen": -246.66983032226562, "logps/rejected": -347.75433349609375, "loss": 0.092, "rewards/accuracies": 0.875, "rewards/chosen": 0.6458020806312561, "rewards/margins": 7.750561714172363, "rewards/rejected": -7.1047587394714355, "step": 821 }, { "epoch": 1.08, "learning_rate": 3.776795436237954e-05, "logits/chosen": -1.7255234718322754, "logits/rejected": -1.691392421722412, "logps/chosen": -167.06256103515625, "logps/rejected": -253.3814697265625, "loss": 0.1163, "rewards/accuracies": 0.875, "rewards/chosen": 1.1990185976028442, "rewards/margins": 8.745574951171875, "rewards/rejected": -7.546555995941162, "step": 822 }, { "epoch": 1.08, "learning_rate": 3.773713630313793e-05, "logits/chosen": -1.5812559127807617, "logits/rejected": -1.50252366065979, "logps/chosen": -169.1471405029297, "logps/rejected": -258.7736511230469, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 0.11404211819171906, "rewards/margins": 6.661553382873535, "rewards/rejected": -6.547510623931885, "step": 823 }, { "epoch": 1.08, "learning_rate": 3.7706292080726055e-05, "logits/chosen": -1.608155608177185, "logits/rejected": -1.5912903547286987, "logps/chosen": -147.78099060058594, "logps/rejected": -269.6217346191406, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 1.0774881839752197, "rewards/margins": 8.897539138793945, "rewards/rejected": -7.820050239562988, "step": 824 }, { "epoch": 1.08, "learning_rate": 3.767542175850058e-05, "logits/chosen": -1.8905123472213745, "logits/rejected": -1.8191280364990234, "logps/chosen": -143.8882293701172, "logps/rejected": -216.68756103515625, "loss": 0.1049, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0501607656478882, "rewards/margins": 6.524418830871582, "rewards/rejected": -5.474257946014404, "step": 825 }, { "epoch": 1.08, "learning_rate": 3.764452539987179e-05, "logits/chosen": -1.5478218793869019, "logits/rejected": -1.5789484977722168, "logps/chosen": -240.63043212890625, "logps/rejected": -349.4985656738281, "loss": 0.0549, "rewards/accuracies": 0.9375, "rewards/chosen": 0.48741525411605835, "rewards/margins": 9.919611930847168, "rewards/rejected": -9.432197570800781, "step": 826 }, { "epoch": 1.08, "learning_rate": 3.761360306830345e-05, "logits/chosen": -1.500814437866211, "logits/rejected": -1.4689234495162964, "logps/chosen": -267.2929382324219, "logps/rejected": -361.5623779296875, "loss": 0.047, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6333886981010437, "rewards/margins": 8.350789070129395, "rewards/rejected": -7.717400074005127, "step": 827 }, { "epoch": 1.08, "learning_rate": 3.75826548273127e-05, "logits/chosen": -1.8496098518371582, "logits/rejected": -1.8652946949005127, "logps/chosen": -166.85903930664062, "logps/rejected": -254.02622985839844, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 1.2260518074035645, "rewards/margins": 9.077446937561035, "rewards/rejected": -7.851395606994629, "step": 828 }, { "epoch": 1.08, "learning_rate": 3.7551680740469874e-05, "logits/chosen": -1.8195977210998535, "logits/rejected": -1.8940608501434326, "logps/chosen": -184.0416259765625, "logps/rejected": -289.5961608886719, "loss": 0.1249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4078165292739868, "rewards/margins": 7.792662143707275, "rewards/rejected": -7.384845733642578, "step": 829 }, { "epoch": 1.09, "learning_rate": 3.752068087139839e-05, "logits/chosen": -1.6744965314865112, "logits/rejected": -1.56267511844635, "logps/chosen": -222.7291259765625, "logps/rejected": -262.7376403808594, "loss": 0.1741, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7223973274230957, "rewards/margins": 7.620642185211182, "rewards/rejected": -6.898245811462402, "step": 830 }, { "epoch": 1.09, "learning_rate": 3.7489655283774657e-05, "logits/chosen": -1.6755584478378296, "logits/rejected": -1.7019976377487183, "logps/chosen": -192.16610717773438, "logps/rejected": -266.71337890625, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 0.2536769509315491, "rewards/margins": 6.898894309997559, "rewards/rejected": -6.6452178955078125, "step": 831 }, { "epoch": 1.09, "learning_rate": 3.7458604041327874e-05, "logits/chosen": -1.4574893712997437, "logits/rejected": -1.4942587614059448, "logps/chosen": -206.4982147216797, "logps/rejected": -312.23504638671875, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.2815260887145996, "rewards/margins": 8.590280532836914, "rewards/rejected": -7.3087544441223145, "step": 832 }, { "epoch": 1.09, "learning_rate": 3.742752720783997e-05, "logits/chosen": -1.6686670780181885, "logits/rejected": -1.7064058780670166, "logps/chosen": -193.9601593017578, "logps/rejected": -258.1878356933594, "loss": 0.0769, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1386452317237854, "rewards/margins": 6.448615074157715, "rewards/rejected": -6.309969425201416, "step": 833 }, { "epoch": 1.09, "learning_rate": 3.7396424847145425e-05, "logits/chosen": -1.814921259880066, "logits/rejected": -1.9397302865982056, "logps/chosen": -136.74314880371094, "logps/rejected": -240.24948120117188, "loss": 0.1083, "rewards/accuracies": 0.875, "rewards/chosen": 0.41532331705093384, "rewards/margins": 7.389758110046387, "rewards/rejected": -6.974433898925781, "step": 834 }, { "epoch": 1.09, "learning_rate": 3.736529702313114e-05, "logits/chosen": -1.766431450843811, "logits/rejected": -1.7151589393615723, "logps/chosen": -184.22366333007812, "logps/rejected": -266.4801025390625, "loss": 0.0664, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2503972053527832, "rewards/margins": 6.810416221618652, "rewards/rejected": -6.560018062591553, "step": 835 }, { "epoch": 1.09, "learning_rate": 3.733414379973635e-05, "logits/chosen": -1.8444455862045288, "logits/rejected": -1.8781747817993164, "logps/chosen": -158.47128295898438, "logps/rejected": -253.3724365234375, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 0.781490683555603, "rewards/margins": 8.908397674560547, "rewards/rejected": -8.126907348632812, "step": 836 }, { "epoch": 1.1, "learning_rate": 3.730296524095245e-05, "logits/chosen": -1.782663345336914, "logits/rejected": -1.8073902130126953, "logps/chosen": -198.73843383789062, "logps/rejected": -277.1875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 1.417394995689392, "rewards/margins": 9.88292407989502, "rewards/rejected": -8.465529441833496, "step": 837 }, { "epoch": 1.1, "learning_rate": 3.7271761410822856e-05, "logits/chosen": -1.6727392673492432, "logits/rejected": -1.7842079401016235, "logps/chosen": -165.4208984375, "logps/rejected": -247.4107666015625, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 0.052548423409461975, "rewards/margins": 6.773774147033691, "rewards/rejected": -6.721225738525391, "step": 838 }, { "epoch": 1.1, "learning_rate": 3.724053237344294e-05, "logits/chosen": -1.7459189891815186, "logits/rejected": -1.7782320976257324, "logps/chosen": -160.22817993164062, "logps/rejected": -229.1803741455078, "loss": 0.076, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19302824139595032, "rewards/margins": 6.072666168212891, "rewards/rejected": -5.879638671875, "step": 839 }, { "epoch": 1.1, "learning_rate": 3.720927819295979e-05, "logits/chosen": -1.5151444673538208, "logits/rejected": -1.4409478902816772, "logps/chosen": -173.93408203125, "logps/rejected": -244.9287567138672, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.37196123600006104, "rewards/margins": 7.8127875328063965, "rewards/rejected": -7.440826892852783, "step": 840 }, { "epoch": 1.1, "learning_rate": 3.7177998933572186e-05, "logits/chosen": -1.7388434410095215, "logits/rejected": -1.7487221956253052, "logps/chosen": -167.10128784179688, "logps/rejected": -233.32119750976562, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 0.59880131483078, "rewards/margins": 6.784365177154541, "rewards/rejected": -6.185564041137695, "step": 841 }, { "epoch": 1.1, "learning_rate": 3.7146694659530425e-05, "logits/chosen": -1.613723874092102, "logits/rejected": -1.5765931606292725, "logps/chosen": -188.1655731201172, "logps/rejected": -248.87420654296875, "loss": 0.0793, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5173835754394531, "rewards/margins": 7.769992828369141, "rewards/rejected": -7.252608776092529, "step": 842 }, { "epoch": 1.1, "learning_rate": 3.711536543513614e-05, "logits/chosen": -1.7683793306350708, "logits/rejected": -1.7448205947875977, "logps/chosen": -186.20999145507812, "logps/rejected": -290.3359375, "loss": 0.0719, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24317941069602966, "rewards/margins": 8.628244400024414, "rewards/rejected": -8.385065078735352, "step": 843 }, { "epoch": 1.1, "learning_rate": 3.708401132474228e-05, "logits/chosen": -1.6240133047103882, "logits/rejected": -1.5917489528656006, "logps/chosen": -176.8642578125, "logps/rejected": -245.3429718017578, "loss": 0.092, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23126250505447388, "rewards/margins": 7.286675453186035, "rewards/rejected": -7.055412292480469, "step": 844 }, { "epoch": 1.11, "learning_rate": 3.705263239275284e-05, "logits/chosen": -1.6431668996810913, "logits/rejected": -1.6729328632354736, "logps/chosen": -224.10528564453125, "logps/rejected": -291.15435791015625, "loss": 0.0575, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7494313716888428, "rewards/margins": 5.913324356079102, "rewards/rejected": -6.662755966186523, "step": 845 }, { "epoch": 1.11, "learning_rate": 3.702122870362286e-05, "logits/chosen": -1.8470516204833984, "logits/rejected": -1.9361313581466675, "logps/chosen": -202.118408203125, "logps/rejected": -262.67987060546875, "loss": 0.0793, "rewards/accuracies": 0.9375, "rewards/chosen": 0.422929048538208, "rewards/margins": 6.7612433433532715, "rewards/rejected": -6.338314056396484, "step": 846 }, { "epoch": 1.11, "learning_rate": 3.698980032185821e-05, "logits/chosen": -1.6243702173233032, "logits/rejected": -1.5518081188201904, "logps/chosen": -156.64627075195312, "logps/rejected": -276.55133056640625, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 0.5403996706008911, "rewards/margins": 9.535897254943848, "rewards/rejected": -8.995497703552246, "step": 847 }, { "epoch": 1.11, "learning_rate": 3.695834731201548e-05, "logits/chosen": -1.706137776374817, "logits/rejected": -1.659182071685791, "logps/chosen": -195.2234344482422, "logps/rejected": -257.5596008300781, "loss": 0.2016, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3023313879966736, "rewards/margins": 5.863803863525391, "rewards/rejected": -5.5614728927612305, "step": 848 }, { "epoch": 1.11, "learning_rate": 3.692686973870184e-05, "logits/chosen": -1.8808422088623047, "logits/rejected": -1.8596181869506836, "logps/chosen": -158.5035858154297, "logps/rejected": -200.2980499267578, "loss": 0.1072, "rewards/accuracies": 0.875, "rewards/chosen": 0.9980390667915344, "rewards/margins": 5.744872570037842, "rewards/rejected": -4.746833324432373, "step": 849 }, { "epoch": 1.11, "learning_rate": 3.689536766657494e-05, "logits/chosen": -1.5721148252487183, "logits/rejected": -1.6410491466522217, "logps/chosen": -168.682861328125, "logps/rejected": -284.8531799316406, "loss": 0.0575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6643502116203308, "rewards/margins": 9.553466796875, "rewards/rejected": -8.889117240905762, "step": 850 }, { "epoch": 1.11, "learning_rate": 3.6863841160342723e-05, "logits/chosen": -1.563720703125, "logits/rejected": -1.6035958528518677, "logps/chosen": -153.61029052734375, "logps/rejected": -225.42413330078125, "loss": 0.0756, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4325341582298279, "rewards/margins": 7.381722450256348, "rewards/rejected": -6.949188709259033, "step": 851 }, { "epoch": 1.12, "learning_rate": 3.683229028476334e-05, "logits/chosen": -1.7827197313308716, "logits/rejected": -1.882027268409729, "logps/chosen": -162.43850708007812, "logps/rejected": -275.7604064941406, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 1.3920907974243164, "rewards/margins": 10.132492065429688, "rewards/rejected": -8.740402221679688, "step": 852 }, { "epoch": 1.12, "learning_rate": 3.6800715104645e-05, "logits/chosen": -1.7220182418823242, "logits/rejected": -1.7189013957977295, "logps/chosen": -140.41427612304688, "logps/rejected": -214.07586669921875, "loss": 0.0604, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6028104424476624, "rewards/margins": 6.747117042541504, "rewards/rejected": -6.1443071365356445, "step": 853 }, { "epoch": 1.12, "learning_rate": 3.676911568484583e-05, "logits/chosen": -1.8405961990356445, "logits/rejected": -1.800131916999817, "logps/chosen": -209.40870666503906, "logps/rejected": -268.63397216796875, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 0.5028942227363586, "rewards/margins": 7.966097354888916, "rewards/rejected": -7.463202476501465, "step": 854 }, { "epoch": 1.12, "learning_rate": 3.673749209027375e-05, "logits/chosen": -1.9057409763336182, "logits/rejected": -1.9989405870437622, "logps/chosen": -134.8566436767578, "logps/rejected": -231.26950073242188, "loss": 0.141, "rewards/accuracies": 0.875, "rewards/chosen": 0.4298328161239624, "rewards/margins": 7.505517482757568, "rewards/rejected": -7.075685977935791, "step": 855 }, { "epoch": 1.12, "learning_rate": 3.6705844385886334e-05, "logits/chosen": -1.86636483669281, "logits/rejected": -1.9175128936767578, "logps/chosen": -142.00611877441406, "logps/rejected": -224.15219116210938, "loss": 0.1332, "rewards/accuracies": 0.875, "rewards/chosen": 0.33483776450157166, "rewards/margins": 6.456839561462402, "rewards/rejected": -6.122001647949219, "step": 856 }, { "epoch": 1.12, "learning_rate": 3.667417263669068e-05, "logits/chosen": -1.9074723720550537, "logits/rejected": -1.922853946685791, "logps/chosen": -153.0294189453125, "logps/rejected": -247.6094970703125, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": 0.6480506658554077, "rewards/margins": 8.770407676696777, "rewards/rejected": -8.122356414794922, "step": 857 }, { "epoch": 1.12, "learning_rate": 3.6642476907743276e-05, "logits/chosen": -1.8091105222702026, "logits/rejected": -1.8391491174697876, "logps/chosen": -173.12405395507812, "logps/rejected": -269.18701171875, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": -0.2547444999217987, "rewards/margins": 7.713589191436768, "rewards/rejected": -7.968333721160889, "step": 858 }, { "epoch": 1.12, "learning_rate": 3.661075726414986e-05, "logits/chosen": -1.6282808780670166, "logits/rejected": -1.5667518377304077, "logps/chosen": -158.9324951171875, "logps/rejected": -252.65725708007812, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 0.3669869303703308, "rewards/margins": 7.350515842437744, "rewards/rejected": -6.9835286140441895, "step": 859 }, { "epoch": 1.13, "learning_rate": 3.6579013771065305e-05, "logits/chosen": -1.5194811820983887, "logits/rejected": -1.5251891613006592, "logps/chosen": -162.73855590820312, "logps/rejected": -289.39093017578125, "loss": 0.0515, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7854263186454773, "rewards/margins": 9.395334243774414, "rewards/rejected": -8.609909057617188, "step": 860 }, { "epoch": 1.13, "learning_rate": 3.654724649369348e-05, "logits/chosen": -1.7831377983093262, "logits/rejected": -1.8511077165603638, "logps/chosen": -159.1595916748047, "logps/rejected": -260.9335021972656, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.343432903289795, "rewards/margins": 9.590715408325195, "rewards/rejected": -8.247282981872559, "step": 861 }, { "epoch": 1.13, "learning_rate": 3.651545549728709e-05, "logits/chosen": -1.8324871063232422, "logits/rejected": -1.8227266073226929, "logps/chosen": -187.99771118164062, "logps/rejected": -261.81964111328125, "loss": 0.091, "rewards/accuracies": 0.9375, "rewards/chosen": -0.018547460436820984, "rewards/margins": 7.8460283279418945, "rewards/rejected": -7.86457633972168, "step": 862 }, { "epoch": 1.13, "learning_rate": 3.6483640847147554e-05, "logits/chosen": -1.7904331684112549, "logits/rejected": -1.7915871143341064, "logps/chosen": -162.7968292236328, "logps/rejected": -221.08908081054688, "loss": 0.0596, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10385438799858093, "rewards/margins": 7.333845138549805, "rewards/rejected": -7.2299909591674805, "step": 863 }, { "epoch": 1.13, "learning_rate": 3.645180260862492e-05, "logits/chosen": -1.6333101987838745, "logits/rejected": -1.680311918258667, "logps/chosen": -205.26104736328125, "logps/rejected": -275.9607849121094, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -0.6030675768852234, "rewards/margins": 7.268556118011475, "rewards/rejected": -7.8716230392456055, "step": 864 }, { "epoch": 1.13, "learning_rate": 3.6419940847117626e-05, "logits/chosen": -1.7607299089431763, "logits/rejected": -1.7272199392318726, "logps/chosen": -186.3272705078125, "logps/rejected": -243.32936096191406, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -0.5442850589752197, "rewards/margins": 6.532572269439697, "rewards/rejected": -7.076857089996338, "step": 865 }, { "epoch": 1.13, "learning_rate": 3.638805562807249e-05, "logits/chosen": -1.6750125885009766, "logits/rejected": -1.6804828643798828, "logps/chosen": -183.07521057128906, "logps/rejected": -282.3734130859375, "loss": 0.0878, "rewards/accuracies": 0.875, "rewards/chosen": 0.4056967496871948, "rewards/margins": 8.726588249206543, "rewards/rejected": -8.320891380310059, "step": 866 }, { "epoch": 1.13, "learning_rate": 3.635614701698448e-05, "logits/chosen": -1.6088908910751343, "logits/rejected": -1.6467278003692627, "logps/chosen": -171.95211791992188, "logps/rejected": -264.917236328125, "loss": 0.0761, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2036418467760086, "rewards/margins": 8.157671928405762, "rewards/rejected": -8.36131477355957, "step": 867 }, { "epoch": 1.14, "learning_rate": 3.632421507939661e-05, "logits/chosen": -1.5411345958709717, "logits/rejected": -1.5687683820724487, "logps/chosen": -171.95297241210938, "logps/rejected": -265.6844177246094, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 0.8591725826263428, "rewards/margins": 8.619390487670898, "rewards/rejected": -7.760217666625977, "step": 868 }, { "epoch": 1.14, "learning_rate": 3.629225988089983e-05, "logits/chosen": -1.677573800086975, "logits/rejected": -1.7168667316436768, "logps/chosen": -180.29104614257812, "logps/rejected": -263.3822326660156, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 0.3133031129837036, "rewards/margins": 8.842041015625, "rewards/rejected": -8.528738021850586, "step": 869 }, { "epoch": 1.14, "learning_rate": 3.6260281487132846e-05, "logits/chosen": -1.5518522262573242, "logits/rejected": -1.5730174779891968, "logps/chosen": -184.6771697998047, "logps/rejected": -269.0153503417969, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 0.7430427670478821, "rewards/margins": 8.222708702087402, "rewards/rejected": -7.479666233062744, "step": 870 }, { "epoch": 1.14, "learning_rate": 3.622827996378203e-05, "logits/chosen": -1.8273077011108398, "logits/rejected": -1.753129482269287, "logps/chosen": -192.535400390625, "logps/rejected": -255.18661499023438, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -0.2789681851863861, "rewards/margins": 7.683708190917969, "rewards/rejected": -7.962676525115967, "step": 871 }, { "epoch": 1.14, "learning_rate": 3.6196255376581254e-05, "logits/chosen": -1.723561406135559, "logits/rejected": -1.7448184490203857, "logps/chosen": -179.89093017578125, "logps/rejected": -277.4232177734375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.9937200546264648, "rewards/margins": 9.481063842773438, "rewards/rejected": -8.487343788146973, "step": 872 }, { "epoch": 1.14, "learning_rate": 3.616420779131177e-05, "logits/chosen": -1.7428555488586426, "logits/rejected": -1.8246798515319824, "logps/chosen": -175.7808380126953, "logps/rejected": -260.22637939453125, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": 0.8115906119346619, "rewards/margins": 8.594477653503418, "rewards/rejected": -7.782886981964111, "step": 873 }, { "epoch": 1.14, "learning_rate": 3.613213727380206e-05, "logits/chosen": -1.8163658380508423, "logits/rejected": -1.8145133256912231, "logps/chosen": -163.59075927734375, "logps/rejected": -231.6703338623047, "loss": 0.0934, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6859133243560791, "rewards/margins": 5.622536659240723, "rewards/rejected": -6.308449745178223, "step": 874 }, { "epoch": 1.15, "learning_rate": 3.610004388992771e-05, "logits/chosen": -1.7247226238250732, "logits/rejected": -1.7080042362213135, "logps/chosen": -160.51553344726562, "logps/rejected": -245.70233154296875, "loss": 0.0738, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1169636249542236, "rewards/margins": 9.182759284973145, "rewards/rejected": -8.065794944763184, "step": 875 }, { "epoch": 1.15, "learning_rate": 3.6067927705611304e-05, "logits/chosen": -1.6803061962127686, "logits/rejected": -1.696300745010376, "logps/chosen": -163.08665466308594, "logps/rejected": -258.111328125, "loss": 0.1504, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11361874639987946, "rewards/margins": 7.612594127655029, "rewards/rejected": -7.49897575378418, "step": 876 }, { "epoch": 1.15, "learning_rate": 3.6035788786822225e-05, "logits/chosen": -1.7864495515823364, "logits/rejected": -1.8517754077911377, "logps/chosen": -169.38356018066406, "logps/rejected": -261.1342468261719, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378201365470886, "rewards/margins": 8.446355819702148, "rewards/rejected": -7.508536338806152, "step": 877 }, { "epoch": 1.15, "learning_rate": 3.6003627199576564e-05, "logits/chosen": -1.6908057928085327, "logits/rejected": -1.7146823406219482, "logps/chosen": -149.77984619140625, "logps/rejected": -215.45220947265625, "loss": 0.0889, "rewards/accuracies": 0.9375, "rewards/chosen": -0.16511961817741394, "rewards/margins": 6.96973180770874, "rewards/rejected": -7.134850978851318, "step": 878 }, { "epoch": 1.15, "learning_rate": 3.597144300993699e-05, "logits/chosen": -1.8882876634597778, "logits/rejected": -1.9150993824005127, "logps/chosen": -143.28770446777344, "logps/rejected": -224.34437561035156, "loss": 0.0574, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4502476453781128, "rewards/margins": 7.336152076721191, "rewards/rejected": -6.885904312133789, "step": 879 }, { "epoch": 1.15, "learning_rate": 3.593923628401259e-05, "logits/chosen": -1.8090243339538574, "logits/rejected": -1.8175634145736694, "logps/chosen": -148.93310546875, "logps/rejected": -242.72918701171875, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 1.103208303451538, "rewards/margins": 8.617406845092773, "rewards/rejected": -7.514198303222656, "step": 880 }, { "epoch": 1.15, "learning_rate": 3.5907007087958726e-05, "logits/chosen": -1.803382396697998, "logits/rejected": -1.7716857194900513, "logps/chosen": -183.49339294433594, "logps/rejected": -251.29864501953125, "loss": 0.1326, "rewards/accuracies": 0.875, "rewards/chosen": -0.4789639115333557, "rewards/margins": 6.777097225189209, "rewards/rejected": -7.256060600280762, "step": 881 }, { "epoch": 1.15, "learning_rate": 3.587475548797694e-05, "logits/chosen": -1.5749878883361816, "logits/rejected": -1.617025375366211, "logps/chosen": -148.74200439453125, "logps/rejected": -228.47140502929688, "loss": 0.0471, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5491048097610474, "rewards/margins": 7.191831588745117, "rewards/rejected": -6.642726421356201, "step": 882 }, { "epoch": 1.16, "learning_rate": 3.5842481550314794e-05, "logits/chosen": -1.8245117664337158, "logits/rejected": -1.80112886428833, "logps/chosen": -165.4649200439453, "logps/rejected": -213.1611785888672, "loss": 0.0503, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9193047881126404, "rewards/margins": 7.948483467102051, "rewards/rejected": -7.029178619384766, "step": 883 }, { "epoch": 1.16, "learning_rate": 3.581018534126571e-05, "logits/chosen": -1.801705002784729, "logits/rejected": -1.834633231163025, "logps/chosen": -170.57785034179688, "logps/rejected": -256.3074035644531, "loss": 0.0518, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03511929512023926, "rewards/margins": 8.271900177001953, "rewards/rejected": -8.236780166625977, "step": 884 }, { "epoch": 1.16, "learning_rate": 3.577786692716886e-05, "logits/chosen": -1.6602346897125244, "logits/rejected": -1.683791995048523, "logps/chosen": -179.22866821289062, "logps/rejected": -301.5026550292969, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 1.2062432765960693, "rewards/margins": 9.739579200744629, "rewards/rejected": -8.533336639404297, "step": 885 }, { "epoch": 1.16, "learning_rate": 3.574552637440907e-05, "logits/chosen": -1.6858350038528442, "logits/rejected": -1.6984546184539795, "logps/chosen": -150.21563720703125, "logps/rejected": -213.68753051757812, "loss": 0.1469, "rewards/accuracies": 0.8125, "rewards/chosen": 0.48701292276382446, "rewards/margins": 6.276709079742432, "rewards/rejected": -5.789695739746094, "step": 886 }, { "epoch": 1.16, "learning_rate": 3.571316374941658e-05, "logits/chosen": -2.0152809619903564, "logits/rejected": -2.0364301204681396, "logps/chosen": -176.3936767578125, "logps/rejected": -234.19671630859375, "loss": 0.1341, "rewards/accuracies": 0.875, "rewards/chosen": -0.20730283856391907, "rewards/margins": 6.322017192840576, "rewards/rejected": -6.529320240020752, "step": 887 }, { "epoch": 1.16, "learning_rate": 3.568077911866703e-05, "logits/chosen": -1.8245972394943237, "logits/rejected": -1.8635644912719727, "logps/chosen": -180.6774444580078, "logps/rejected": -273.9190673828125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.14365682005882263, "rewards/margins": 8.138999938964844, "rewards/rejected": -7.995343208312988, "step": 888 }, { "epoch": 1.16, "learning_rate": 3.564837254868118e-05, "logits/chosen": -1.8740934133529663, "logits/rejected": -1.8795528411865234, "logps/chosen": -160.5602569580078, "logps/rejected": -250.400634765625, "loss": 0.0483, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7248006463050842, "rewards/margins": 7.648869514465332, "rewards/rejected": -8.373669624328613, "step": 889 }, { "epoch": 1.16, "learning_rate": 3.561594410602495e-05, "logits/chosen": -1.851047158241272, "logits/rejected": -1.8520572185516357, "logps/chosen": -202.7530059814453, "logps/rejected": -255.25729370117188, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 0.1345672458410263, "rewards/margins": 7.931034564971924, "rewards/rejected": -7.796467304229736, "step": 890 }, { "epoch": 1.17, "learning_rate": 3.558349385730913e-05, "logits/chosen": -1.9940862655639648, "logits/rejected": -1.9213124513626099, "logps/chosen": -200.50384521484375, "logps/rejected": -277.02764892578125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 0.5816916823387146, "rewards/margins": 8.134634971618652, "rewards/rejected": -7.552943229675293, "step": 891 }, { "epoch": 1.17, "learning_rate": 3.5551021869189286e-05, "logits/chosen": -1.916977047920227, "logits/rejected": -1.9621555805206299, "logps/chosen": -175.3245391845703, "logps/rejected": -248.02911376953125, "loss": 0.1087, "rewards/accuracies": 0.875, "rewards/chosen": 0.9088395833969116, "rewards/margins": 7.577523231506348, "rewards/rejected": -6.668683052062988, "step": 892 }, { "epoch": 1.17, "learning_rate": 3.55185282083657e-05, "logits/chosen": -1.830249547958374, "logits/rejected": -1.9227575063705444, "logps/chosen": -157.8826446533203, "logps/rejected": -272.9053649902344, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.1682499647140503, "rewards/margins": 9.896993637084961, "rewards/rejected": -8.728743553161621, "step": 893 }, { "epoch": 1.17, "learning_rate": 3.548601294158313e-05, "logits/chosen": -1.9553158283233643, "logits/rejected": -1.9773513078689575, "logps/chosen": -223.3098907470703, "logps/rejected": -267.57659912109375, "loss": 0.1966, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7544253468513489, "rewards/margins": 5.037443161010742, "rewards/rejected": -5.791868686676025, "step": 894 }, { "epoch": 1.17, "learning_rate": 3.5453476135630706e-05, "logits/chosen": -2.1675331592559814, "logits/rejected": -2.06319522857666, "logps/chosen": -168.5749053955078, "logps/rejected": -212.3041229248047, "loss": 0.0527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8117193579673767, "rewards/margins": 7.594612121582031, "rewards/rejected": -6.782892227172852, "step": 895 }, { "epoch": 1.17, "learning_rate": 3.542091785734184e-05, "logits/chosen": -1.8826407194137573, "logits/rejected": -1.9465281963348389, "logps/chosen": -161.4238739013672, "logps/rejected": -249.60406494140625, "loss": 0.0825, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6117186546325684, "rewards/margins": 8.12980842590332, "rewards/rejected": -7.518089771270752, "step": 896 }, { "epoch": 1.17, "learning_rate": 3.538833817359401e-05, "logits/chosen": -1.9344408512115479, "logits/rejected": -1.9823689460754395, "logps/chosen": -188.45948791503906, "logps/rejected": -285.7137756347656, "loss": 0.1665, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4131191074848175, "rewards/margins": 9.048032760620117, "rewards/rejected": -8.63491439819336, "step": 897 }, { "epoch": 1.18, "learning_rate": 3.5355737151308686e-05, "logits/chosen": -1.7558889389038086, "logits/rejected": -1.7752621173858643, "logps/chosen": -162.7386016845703, "logps/rejected": -242.3275146484375, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": -0.5808318853378296, "rewards/margins": 6.759397029876709, "rewards/rejected": -7.340229034423828, "step": 898 }, { "epoch": 1.18, "learning_rate": 3.5323114857451174e-05, "logits/chosen": -2.1031816005706787, "logits/rejected": -2.0772862434387207, "logps/chosen": -179.40110778808594, "logps/rejected": -246.42173767089844, "loss": 0.0558, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9522780776023865, "rewards/margins": 8.00682258605957, "rewards/rejected": -7.054544448852539, "step": 899 }, { "epoch": 1.18, "learning_rate": 3.529047135903045e-05, "logits/chosen": -1.9245232343673706, "logits/rejected": -1.9527860879898071, "logps/chosen": -164.22299194335938, "logps/rejected": -268.86181640625, "loss": 0.0468, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6341921091079712, "rewards/margins": 9.523921012878418, "rewards/rejected": -8.889729499816895, "step": 900 }, { "epoch": 1.18, "learning_rate": 3.525780672309907e-05, "logits/chosen": -1.7722699642181396, "logits/rejected": -1.789467215538025, "logps/chosen": -152.01785278320312, "logps/rejected": -239.99957275390625, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3282092809677124, "rewards/margins": 9.678154945373535, "rewards/rejected": -8.349946022033691, "step": 901 }, { "epoch": 1.18, "learning_rate": 3.522512101675299e-05, "logits/chosen": -1.8270869255065918, "logits/rejected": -1.898951530456543, "logps/chosen": -140.39678955078125, "logps/rejected": -211.85061645507812, "loss": 0.1175, "rewards/accuracies": 0.875, "rewards/chosen": -0.8393736481666565, "rewards/margins": 6.014346122741699, "rewards/rejected": -6.853720188140869, "step": 902 }, { "epoch": 1.18, "learning_rate": 3.519241430713145e-05, "logits/chosen": -1.741802453994751, "logits/rejected": -1.7483961582183838, "logps/chosen": -193.8229217529297, "logps/rejected": -237.98211669921875, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -0.886393129825592, "rewards/margins": 5.339400291442871, "rewards/rejected": -6.22579288482666, "step": 903 }, { "epoch": 1.18, "learning_rate": 3.5159686661416834e-05, "logits/chosen": -1.872272253036499, "logits/rejected": -1.914929747581482, "logps/chosen": -190.64895629882812, "logps/rejected": -272.30865478515625, "loss": 0.068, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09860727936029434, "rewards/margins": 7.040101051330566, "rewards/rejected": -6.941493988037109, "step": 904 }, { "epoch": 1.18, "learning_rate": 3.512693814683456e-05, "logits/chosen": -1.8728840351104736, "logits/rejected": -1.8658220767974854, "logps/chosen": -191.35928344726562, "logps/rejected": -261.27423095703125, "loss": 0.0753, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8436151146888733, "rewards/margins": 7.535486221313477, "rewards/rejected": -6.691871166229248, "step": 905 }, { "epoch": 1.19, "learning_rate": 3.5094168830652854e-05, "logits/chosen": -1.8646080493927002, "logits/rejected": -1.864649772644043, "logps/chosen": -172.74118041992188, "logps/rejected": -247.68109130859375, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 0.3584592640399933, "rewards/margins": 7.375222206115723, "rewards/rejected": -7.016762733459473, "step": 906 }, { "epoch": 1.19, "learning_rate": 3.506137878018272e-05, "logits/chosen": -1.9659096002578735, "logits/rejected": -1.9836225509643555, "logps/chosen": -160.93516540527344, "logps/rejected": -274.54052734375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.14322397112846375, "rewards/margins": 8.324197769165039, "rewards/rejected": -8.180973052978516, "step": 907 }, { "epoch": 1.19, "learning_rate": 3.502856806277773e-05, "logits/chosen": -1.8923836946487427, "logits/rejected": -1.8981733322143555, "logps/chosen": -187.4054412841797, "logps/rejected": -274.0010681152344, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.868060290813446, "rewards/margins": 10.109868049621582, "rewards/rejected": -9.24180793762207, "step": 908 }, { "epoch": 1.19, "learning_rate": 3.4995736745833895e-05, "logits/chosen": -2.094193696975708, "logits/rejected": -2.0947916507720947, "logps/chosen": -154.0366973876953, "logps/rejected": -235.6873016357422, "loss": 0.1445, "rewards/accuracies": 0.875, "rewards/chosen": 0.21441936492919922, "rewards/margins": 6.727463722229004, "rewards/rejected": -6.5130438804626465, "step": 909 }, { "epoch": 1.19, "learning_rate": 3.496288489678958e-05, "logits/chosen": -1.4902489185333252, "logits/rejected": -1.5242319107055664, "logps/chosen": -194.15325927734375, "logps/rejected": -288.6688537597656, "loss": 0.0442, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15938527882099152, "rewards/margins": 8.332877159118652, "rewards/rejected": -8.492262840270996, "step": 910 }, { "epoch": 1.19, "learning_rate": 3.493001258312529e-05, "logits/chosen": -1.9223310947418213, "logits/rejected": -1.9442507028579712, "logps/chosen": -158.0222930908203, "logps/rejected": -258.5923767089844, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.13621219992637634, "rewards/margins": 8.879887580871582, "rewards/rejected": -8.743675231933594, "step": 911 }, { "epoch": 1.19, "learning_rate": 3.489711987236357e-05, "logits/chosen": -1.8931705951690674, "logits/rejected": -1.92643404006958, "logps/chosen": -189.59190368652344, "logps/rejected": -254.67291259765625, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -0.908362090587616, "rewards/margins": 6.802270412445068, "rewards/rejected": -7.71063232421875, "step": 912 }, { "epoch": 1.19, "learning_rate": 3.4864206832068884e-05, "logits/chosen": -1.6683810949325562, "logits/rejected": -1.6903074979782104, "logps/chosen": -174.6500244140625, "logps/rejected": -238.3423614501953, "loss": 0.1404, "rewards/accuracies": 0.875, "rewards/chosen": -0.7650407552719116, "rewards/margins": 6.08333158493042, "rewards/rejected": -6.848372936248779, "step": 913 }, { "epoch": 1.2, "learning_rate": 3.483127352984742e-05, "logits/chosen": -1.5658330917358398, "logits/rejected": -1.5612378120422363, "logps/chosen": -191.35238647460938, "logps/rejected": -275.3499450683594, "loss": 0.0956, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4658517837524414, "rewards/margins": 7.195768356323242, "rewards/rejected": -7.661620140075684, "step": 914 }, { "epoch": 1.2, "learning_rate": 3.479832003334702e-05, "logits/chosen": -1.767727017402649, "logits/rejected": -1.743235468864441, "logps/chosen": -194.4393310546875, "logps/rejected": -251.51502990722656, "loss": 0.1065, "rewards/accuracies": 0.875, "rewards/chosen": -0.39378470182418823, "rewards/margins": 6.869220733642578, "rewards/rejected": -7.26300573348999, "step": 915 }, { "epoch": 1.2, "learning_rate": 3.476534641025698e-05, "logits/chosen": -1.7358970642089844, "logits/rejected": -1.6227883100509644, "logps/chosen": -164.82554626464844, "logps/rejected": -246.82467651367188, "loss": 0.0634, "rewards/accuracies": 0.9375, "rewards/chosen": 0.006584217771887779, "rewards/margins": 6.655069351196289, "rewards/rejected": -6.6484856605529785, "step": 916 }, { "epoch": 1.2, "learning_rate": 3.4732352728307966e-05, "logits/chosen": -1.9275317192077637, "logits/rejected": -1.9865800142288208, "logps/chosen": -209.73939514160156, "logps/rejected": -301.17333984375, "loss": 0.1449, "rewards/accuracies": 0.8125, "rewards/chosen": 0.43461543321609497, "rewards/margins": 7.842949390411377, "rewards/rejected": -7.408333778381348, "step": 917 }, { "epoch": 1.2, "learning_rate": 3.469933905527182e-05, "logits/chosen": -1.8539807796478271, "logits/rejected": -1.8448735475540161, "logps/chosen": -147.5707244873047, "logps/rejected": -214.45086669921875, "loss": 0.0996, "rewards/accuracies": 0.875, "rewards/chosen": -0.026866242289543152, "rewards/margins": 6.803654670715332, "rewards/rejected": -6.8305206298828125, "step": 918 }, { "epoch": 1.2, "learning_rate": 3.466630545896146e-05, "logits/chosen": -1.87417471408844, "logits/rejected": -1.8515712022781372, "logps/chosen": -181.49510192871094, "logps/rejected": -237.30181884765625, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 0.1314505785703659, "rewards/margins": 6.971901893615723, "rewards/rejected": -6.840450763702393, "step": 919 }, { "epoch": 1.2, "learning_rate": 3.463325200723071e-05, "logits/chosen": -1.7973699569702148, "logits/rejected": -1.896343469619751, "logps/chosen": -149.91091918945312, "logps/rejected": -225.79420471191406, "loss": 0.0931, "rewards/accuracies": 0.875, "rewards/chosen": -0.6555187106132507, "rewards/margins": 6.618897438049316, "rewards/rejected": -7.274415969848633, "step": 920 }, { "epoch": 1.21, "learning_rate": 3.460017876797422e-05, "logits/chosen": -1.7525713443756104, "logits/rejected": -1.6888906955718994, "logps/chosen": -205.05142211914062, "logps/rejected": -276.8188781738281, "loss": 0.1139, "rewards/accuracies": 0.875, "rewards/chosen": -1.1992167234420776, "rewards/margins": 6.552946090698242, "rewards/rejected": -7.752162933349609, "step": 921 }, { "epoch": 1.21, "learning_rate": 3.456708580912725e-05, "logits/chosen": -2.064255475997925, "logits/rejected": -2.0105979442596436, "logps/chosen": -180.6109619140625, "logps/rejected": -242.7440948486328, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": -0.47643887996673584, "rewards/margins": 6.825469493865967, "rewards/rejected": -7.301907539367676, "step": 922 }, { "epoch": 1.21, "learning_rate": 3.453397319866557e-05, "logits/chosen": -1.9662121534347534, "logits/rejected": -1.9616503715515137, "logps/chosen": -158.33319091796875, "logps/rejected": -220.38682556152344, "loss": 0.1106, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5450817346572876, "rewards/margins": 5.366931438446045, "rewards/rejected": -5.912014007568359, "step": 923 }, { "epoch": 1.21, "learning_rate": 3.4500841004605324e-05, "logits/chosen": -1.5947680473327637, "logits/rejected": -1.641003131866455, "logps/chosen": -191.16773986816406, "logps/rejected": -282.4808044433594, "loss": 0.1363, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3834429085254669, "rewards/margins": 9.609367370605469, "rewards/rejected": -9.225924491882324, "step": 924 }, { "epoch": 1.21, "learning_rate": 3.446768929500288e-05, "logits/chosen": -1.9656989574432373, "logits/rejected": -1.9933035373687744, "logps/chosen": -169.61203002929688, "logps/rejected": -272.0287170410156, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 0.409053236246109, "rewards/margins": 9.076614379882812, "rewards/rejected": -8.667560577392578, "step": 925 }, { "epoch": 1.21, "learning_rate": 3.443451813795469e-05, "logits/chosen": -1.8398690223693848, "logits/rejected": -1.8831474781036377, "logps/chosen": -211.554931640625, "logps/rejected": -309.2585144042969, "loss": 0.0769, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6313567161560059, "rewards/margins": 8.130779266357422, "rewards/rejected": -8.762136459350586, "step": 926 }, { "epoch": 1.21, "learning_rate": 3.4401327601597174e-05, "logits/chosen": -1.985721468925476, "logits/rejected": -1.9450913667678833, "logps/chosen": -218.31907653808594, "logps/rejected": -290.0771484375, "loss": 0.0982, "rewards/accuracies": 0.875, "rewards/chosen": 0.06794175505638123, "rewards/margins": 7.096756935119629, "rewards/rejected": -7.028815746307373, "step": 927 }, { "epoch": 1.21, "learning_rate": 3.436811775410651e-05, "logits/chosen": -1.8266397714614868, "logits/rejected": -1.869195580482483, "logps/chosen": -158.58258056640625, "logps/rejected": -253.25064086914062, "loss": 0.0554, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1370869874954224, "rewards/margins": 9.21893310546875, "rewards/rejected": -8.081846237182617, "step": 928 }, { "epoch": 1.22, "learning_rate": 3.43348886636986e-05, "logits/chosen": -1.9212427139282227, "logits/rejected": -1.9511475563049316, "logps/chosen": -158.55897521972656, "logps/rejected": -246.16014099121094, "loss": 0.062, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13792043924331665, "rewards/margins": 8.233397483825684, "rewards/rejected": -8.095476150512695, "step": 929 }, { "epoch": 1.22, "learning_rate": 3.430164039862882e-05, "logits/chosen": -1.6417392492294312, "logits/rejected": -1.6897914409637451, "logps/chosen": -168.390625, "logps/rejected": -240.8642578125, "loss": 0.0492, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16894683241844177, "rewards/margins": 7.576196670532227, "rewards/rejected": -7.407250881195068, "step": 930 }, { "epoch": 1.22, "learning_rate": 3.426837302719197e-05, "logits/chosen": -1.8382885456085205, "logits/rejected": -1.816691517829895, "logps/chosen": -228.66392517089844, "logps/rejected": -333.4481201171875, "loss": 0.0483, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22568544745445251, "rewards/margins": 8.674262046813965, "rewards/rejected": -8.899946212768555, "step": 931 }, { "epoch": 1.22, "learning_rate": 3.42350866177221e-05, "logits/chosen": -1.8598941564559937, "logits/rejected": -1.8177353143692017, "logps/chosen": -168.64523315429688, "logps/rejected": -267.4147644042969, "loss": 0.0688, "rewards/accuracies": 0.9375, "rewards/chosen": -0.040854811668395996, "rewards/margins": 7.731595516204834, "rewards/rejected": -7.772449970245361, "step": 932 }, { "epoch": 1.22, "learning_rate": 3.420178123859233e-05, "logits/chosen": -1.762475609779358, "logits/rejected": -1.712884545326233, "logps/chosen": -189.55911254882812, "logps/rejected": -261.5013427734375, "loss": 0.0699, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7337988615036011, "rewards/margins": 6.591477870941162, "rewards/rejected": -7.3252763748168945, "step": 933 }, { "epoch": 1.22, "learning_rate": 3.416845695821476e-05, "logits/chosen": -1.8344154357910156, "logits/rejected": -1.8057382106781006, "logps/chosen": -180.87850952148438, "logps/rejected": -250.47337341308594, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -0.29066789150238037, "rewards/margins": 7.957887649536133, "rewards/rejected": -8.248556137084961, "step": 934 }, { "epoch": 1.22, "learning_rate": 3.413511384504034e-05, "logits/chosen": -2.0944817066192627, "logits/rejected": -2.08333420753479, "logps/chosen": -171.9430694580078, "logps/rejected": -253.50955200195312, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -0.2787511646747589, "rewards/margins": 8.49667739868164, "rewards/rejected": -8.775429725646973, "step": 935 }, { "epoch": 1.22, "learning_rate": 3.410175196755866e-05, "logits/chosen": -1.9048943519592285, "logits/rejected": -1.8727785348892212, "logps/chosen": -178.324951171875, "logps/rejected": -269.0649719238281, "loss": 0.0686, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0880991518497467, "rewards/margins": 8.45480728149414, "rewards/rejected": -8.542905807495117, "step": 936 }, { "epoch": 1.23, "learning_rate": 3.40683713942979e-05, "logits/chosen": -1.671642780303955, "logits/rejected": -1.6982778310775757, "logps/chosen": -176.78634643554688, "logps/rejected": -280.5889892578125, "loss": 0.0955, "rewards/accuracies": 0.875, "rewards/chosen": -0.5775284767150879, "rewards/margins": 8.944602966308594, "rewards/rejected": -9.52213191986084, "step": 937 }, { "epoch": 1.23, "learning_rate": 3.403497219382461e-05, "logits/chosen": -1.9782202243804932, "logits/rejected": -1.998267650604248, "logps/chosen": -163.75387573242188, "logps/rejected": -259.76483154296875, "loss": 0.0988, "rewards/accuracies": 0.875, "rewards/chosen": 0.2686743140220642, "rewards/margins": 8.010343551635742, "rewards/rejected": -7.741668701171875, "step": 938 }, { "epoch": 1.23, "learning_rate": 3.400155443474361e-05, "logits/chosen": -1.8512263298034668, "logits/rejected": -1.8064016103744507, "logps/chosen": -194.15982055664062, "logps/rejected": -293.0312194824219, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.759626567363739, "rewards/margins": 10.006168365478516, "rewards/rejected": -9.246540069580078, "step": 939 }, { "epoch": 1.23, "learning_rate": 3.396811818569785e-05, "logits/chosen": -1.7697315216064453, "logits/rejected": -1.7975068092346191, "logps/chosen": -168.0388946533203, "logps/rejected": -251.87774658203125, "loss": 0.0526, "rewards/accuracies": 0.9375, "rewards/chosen": 1.377260446548462, "rewards/margins": 9.335554122924805, "rewards/rejected": -7.9582929611206055, "step": 940 }, { "epoch": 1.23, "learning_rate": 3.3934663515368236e-05, "logits/chosen": -1.8283406496047974, "logits/rejected": -1.9062525033950806, "logps/chosen": -164.63389587402344, "logps/rejected": -247.40115356445312, "loss": 0.1111, "rewards/accuracies": 0.875, "rewards/chosen": -0.09177665412425995, "rewards/margins": 7.34284782409668, "rewards/rejected": -7.434624671936035, "step": 941 }, { "epoch": 1.23, "learning_rate": 3.3901190492473554e-05, "logits/chosen": -1.8158893585205078, "logits/rejected": -1.8978768587112427, "logps/chosen": -171.03732299804688, "logps/rejected": -265.78216552734375, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -0.36732611060142517, "rewards/margins": 8.224309921264648, "rewards/rejected": -8.591635704040527, "step": 942 }, { "epoch": 1.23, "learning_rate": 3.3867699185770255e-05, "logits/chosen": -1.5865942239761353, "logits/rejected": -1.663236379623413, "logps/chosen": -207.1858367919922, "logps/rejected": -319.83319091796875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.15542030334472656, "rewards/margins": 10.122888565063477, "rewards/rejected": -10.278308868408203, "step": 943 }, { "epoch": 1.24, "learning_rate": 3.383418966405234e-05, "logits/chosen": -1.7222788333892822, "logits/rejected": -1.751545786857605, "logps/chosen": -174.07720947265625, "logps/rejected": -266.2615051269531, "loss": 0.1213, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24621230363845825, "rewards/margins": 8.12299919128418, "rewards/rejected": -7.876787185668945, "step": 944 }, { "epoch": 1.24, "learning_rate": 3.3800661996151264e-05, "logits/chosen": -1.7203212976455688, "logits/rejected": -1.7764899730682373, "logps/chosen": -164.42947387695312, "logps/rejected": -256.416259765625, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 0.7283108234405518, "rewards/margins": 8.709585189819336, "rewards/rejected": -7.981274604797363, "step": 945 }, { "epoch": 1.24, "learning_rate": 3.376711625093571e-05, "logits/chosen": -1.603279709815979, "logits/rejected": -1.5945706367492676, "logps/chosen": -192.04019165039062, "logps/rejected": -271.12579345703125, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -0.3455383777618408, "rewards/margins": 7.874174118041992, "rewards/rejected": -8.219711303710938, "step": 946 }, { "epoch": 1.24, "learning_rate": 3.373355249731153e-05, "logits/chosen": -1.7255451679229736, "logits/rejected": -1.7624372243881226, "logps/chosen": -164.78860473632812, "logps/rejected": -276.87115478515625, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 1.0089190006256104, "rewards/margins": 9.92430305480957, "rewards/rejected": -8.915383338928223, "step": 947 }, { "epoch": 1.24, "learning_rate": 3.369997080422155e-05, "logits/chosen": -1.7678481340408325, "logits/rejected": -1.833672285079956, "logps/chosen": -194.07901000976562, "logps/rejected": -296.79266357421875, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.4331166446208954, "rewards/margins": 10.406550407409668, "rewards/rejected": -9.973432540893555, "step": 948 }, { "epoch": 1.24, "learning_rate": 3.366637124064544e-05, "logits/chosen": -1.9094618558883667, "logits/rejected": -1.9102199077606201, "logps/chosen": -166.8306121826172, "logps/rejected": -290.0064697265625, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 0.8467321395874023, "rewards/margins": 10.709648132324219, "rewards/rejected": -9.8629150390625, "step": 949 }, { "epoch": 1.24, "learning_rate": 3.36327538755996e-05, "logits/chosen": -1.890424370765686, "logits/rejected": -1.8965479135513306, "logps/chosen": -203.60035705566406, "logps/rejected": -275.9346923828125, "loss": 0.1489, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8383345603942871, "rewards/margins": 6.4683966636657715, "rewards/rejected": -7.306731700897217, "step": 950 }, { "epoch": 1.24, "learning_rate": 3.3599118778136965e-05, "logits/chosen": -1.4926958084106445, "logits/rejected": -1.5053917169570923, "logps/chosen": -230.29598999023438, "logps/rejected": -295.8033752441406, "loss": 0.1053, "rewards/accuracies": 0.875, "rewards/chosen": -1.3924224376678467, "rewards/margins": 7.463953495025635, "rewards/rejected": -8.856375694274902, "step": 951 }, { "epoch": 1.25, "learning_rate": 3.356546601734692e-05, "logits/chosen": -1.5974106788635254, "logits/rejected": -1.6120538711547852, "logps/chosen": -199.6564178466797, "logps/rejected": -310.5819396972656, "loss": 0.0693, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2950756549835205, "rewards/margins": 6.976012229919434, "rewards/rejected": -8.271088600158691, "step": 952 }, { "epoch": 1.25, "learning_rate": 3.3531795662355115e-05, "logits/chosen": -1.8552980422973633, "logits/rejected": -1.8632795810699463, "logps/chosen": -192.36004638671875, "logps/rejected": -277.8472900390625, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -1.0061737298965454, "rewards/margins": 8.26196002960205, "rewards/rejected": -9.268133163452148, "step": 953 }, { "epoch": 1.25, "learning_rate": 3.349810778232335e-05, "logits/chosen": -1.8003088235855103, "logits/rejected": -1.7985399961471558, "logps/chosen": -151.33642578125, "logps/rejected": -245.3261260986328, "loss": 0.1194, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29827451705932617, "rewards/margins": 8.245087623596191, "rewards/rejected": -7.946812629699707, "step": 954 }, { "epoch": 1.25, "learning_rate": 3.346440244644942e-05, "logits/chosen": -1.7284401655197144, "logits/rejected": -1.7348697185516357, "logps/chosen": -169.25662231445312, "logps/rejected": -268.5526428222656, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -0.59054034948349, "rewards/margins": 8.547279357910156, "rewards/rejected": -9.137819290161133, "step": 955 }, { "epoch": 1.25, "learning_rate": 3.3430679723966976e-05, "logits/chosen": -1.776133418083191, "logits/rejected": -1.8338634967803955, "logps/chosen": -169.5686492919922, "logps/rejected": -286.97802734375, "loss": 0.125, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3752596378326416, "rewards/margins": 8.643117904663086, "rewards/rejected": -8.267858505249023, "step": 956 }, { "epoch": 1.25, "learning_rate": 3.339693968414538e-05, "logits/chosen": -1.6924769878387451, "logits/rejected": -1.69418466091156, "logps/chosen": -199.43775939941406, "logps/rejected": -265.88092041015625, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -0.8435568809509277, "rewards/margins": 7.179770469665527, "rewards/rejected": -8.023327827453613, "step": 957 }, { "epoch": 1.25, "learning_rate": 3.336318239628956e-05, "logits/chosen": -1.7992136478424072, "logits/rejected": -1.7918800115585327, "logps/chosen": -173.89222717285156, "logps/rejected": -242.6941375732422, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -0.014464115723967552, "rewards/margins": 7.979608535766602, "rewards/rejected": -7.994071960449219, "step": 958 }, { "epoch": 1.26, "learning_rate": 3.3329407929739906e-05, "logits/chosen": -1.8405758142471313, "logits/rejected": -1.8193663358688354, "logps/chosen": -191.95260620117188, "logps/rejected": -312.0247802734375, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012057796120643616, "rewards/margins": 9.450359344482422, "rewards/rejected": -9.45156478881836, "step": 959 }, { "epoch": 1.26, "learning_rate": 3.3295616353872026e-05, "logits/chosen": -1.6075886487960815, "logits/rejected": -1.5258815288543701, "logps/chosen": -171.4676513671875, "logps/rejected": -256.88238525390625, "loss": 0.0715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5326271057128906, "rewards/margins": 9.00959587097168, "rewards/rejected": -9.542222023010254, "step": 960 }, { "epoch": 1.26, "learning_rate": 3.326180773809676e-05, "logits/chosen": -1.7329258918762207, "logits/rejected": -1.7353699207305908, "logps/chosen": -167.04550170898438, "logps/rejected": -292.188232421875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 1.487869143486023, "rewards/margins": 11.961874008178711, "rewards/rejected": -10.474005699157715, "step": 961 }, { "epoch": 1.26, "learning_rate": 3.3227982151859873e-05, "logits/chosen": -1.8756731748580933, "logits/rejected": -1.9023237228393555, "logps/chosen": -167.3818359375, "logps/rejected": -231.07977294921875, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": -0.8674279451370239, "rewards/margins": 6.189681053161621, "rewards/rejected": -7.0571088790893555, "step": 962 }, { "epoch": 1.26, "learning_rate": 3.3194139664642035e-05, "logits/chosen": -1.8310325145721436, "logits/rejected": -1.843670129776001, "logps/chosen": -161.2306365966797, "logps/rejected": -294.9887390136719, "loss": 0.0467, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30892035365104675, "rewards/margins": 11.805313110351562, "rewards/rejected": -11.496391296386719, "step": 963 }, { "epoch": 1.26, "learning_rate": 3.3160280345958614e-05, "logits/chosen": -1.7281033992767334, "logits/rejected": -1.7897300720214844, "logps/chosen": -142.33782958984375, "logps/rejected": -255.6434326171875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 0.6555277109146118, "rewards/margins": 9.613879203796387, "rewards/rejected": -8.958351135253906, "step": 964 }, { "epoch": 1.26, "learning_rate": 3.3126404265359545e-05, "logits/chosen": -1.865938425064087, "logits/rejected": -1.8880172967910767, "logps/chosen": -183.08792114257812, "logps/rejected": -271.8849182128906, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -0.19245201349258423, "rewards/margins": 8.922394752502441, "rewards/rejected": -9.114846229553223, "step": 965 }, { "epoch": 1.26, "learning_rate": 3.3092511492429216e-05, "logits/chosen": -1.819749116897583, "logits/rejected": -1.7672382593154907, "logps/chosen": -193.35067749023438, "logps/rejected": -261.9708251953125, "loss": 0.0636, "rewards/accuracies": 0.9375, "rewards/chosen": -0.34252068400382996, "rewards/margins": 6.740435600280762, "rewards/rejected": -7.082956314086914, "step": 966 }, { "epoch": 1.27, "learning_rate": 3.305860209678628e-05, "logits/chosen": -1.6987472772598267, "logits/rejected": -1.7095977067947388, "logps/chosen": -139.22987365722656, "logps/rejected": -237.2021484375, "loss": 0.0886, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03404363989830017, "rewards/margins": 7.799056529998779, "rewards/rejected": -7.833099365234375, "step": 967 }, { "epoch": 1.27, "learning_rate": 3.3024676148083555e-05, "logits/chosen": -1.7296700477600098, "logits/rejected": -1.7009743452072144, "logps/chosen": -184.88128662109375, "logps/rejected": -294.8798522949219, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 0.7556076645851135, "rewards/margins": 10.581036567687988, "rewards/rejected": -9.825429916381836, "step": 968 }, { "epoch": 1.27, "learning_rate": 3.299073371600784e-05, "logits/chosen": -1.6321117877960205, "logits/rejected": -1.672057032585144, "logps/chosen": -181.47630310058594, "logps/rejected": -283.00860595703125, "loss": 0.0612, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08924361318349838, "rewards/margins": 9.234962463378906, "rewards/rejected": -9.324206352233887, "step": 969 }, { "epoch": 1.27, "learning_rate": 3.29567748702798e-05, "logits/chosen": -1.3825104236602783, "logits/rejected": -1.3645695447921753, "logps/chosen": -162.88233947753906, "logps/rejected": -217.79019165039062, "loss": 0.1412, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11819088459014893, "rewards/margins": 5.8656392097473145, "rewards/rejected": -5.983829975128174, "step": 970 }, { "epoch": 1.27, "learning_rate": 3.2922799680653816e-05, "logits/chosen": -1.5520780086517334, "logits/rejected": -1.5310850143432617, "logps/chosen": -230.1056671142578, "logps/rejected": -302.88671875, "loss": 0.1557, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2427598237991333, "rewards/margins": 6.7874555587768555, "rewards/rejected": -8.0302152633667, "step": 971 }, { "epoch": 1.27, "learning_rate": 3.288880821691785e-05, "logits/chosen": -1.3305836915969849, "logits/rejected": -1.3754537105560303, "logps/chosen": -189.15621948242188, "logps/rejected": -298.706298828125, "loss": 0.0487, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9303971529006958, "rewards/margins": 8.755258560180664, "rewards/rejected": -9.68565559387207, "step": 972 }, { "epoch": 1.27, "learning_rate": 3.285480054889327e-05, "logits/chosen": -1.5995906591415405, "logits/rejected": -1.7117842435836792, "logps/chosen": -147.44998168945312, "logps/rejected": -252.13072204589844, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 0.49690061807632446, "rewards/margins": 9.193412780761719, "rewards/rejected": -8.696512222290039, "step": 973 }, { "epoch": 1.27, "learning_rate": 3.2820776746434764e-05, "logits/chosen": -1.558459997177124, "logits/rejected": -1.5185562372207642, "logps/chosen": -221.10980224609375, "logps/rejected": -294.4140930175781, "loss": 0.0953, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43937918543815613, "rewards/margins": 8.335892677307129, "rewards/rejected": -8.775272369384766, "step": 974 }, { "epoch": 1.28, "learning_rate": 3.278673687943011e-05, "logits/chosen": -1.3949395418167114, "logits/rejected": -1.476244568824768, "logps/chosen": -157.56048583984375, "logps/rejected": -242.78616333007812, "loss": 0.1012, "rewards/accuracies": 0.875, "rewards/chosen": -0.5070436596870422, "rewards/margins": 6.898143768310547, "rewards/rejected": -7.405187129974365, "step": 975 }, { "epoch": 1.28, "learning_rate": 3.2752681017800144e-05, "logits/chosen": -1.8722678422927856, "logits/rejected": -1.8975712060928345, "logps/chosen": -189.45339965820312, "logps/rejected": -278.3753662109375, "loss": 0.053, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11852896958589554, "rewards/margins": 8.716239929199219, "rewards/rejected": -8.834769248962402, "step": 976 }, { "epoch": 1.28, "learning_rate": 3.27186092314985e-05, "logits/chosen": -1.9218028783798218, "logits/rejected": -1.8684682846069336, "logps/chosen": -149.4844970703125, "logps/rejected": -228.8724365234375, "loss": 0.0556, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9821925163269043, "rewards/margins": 9.00061321258545, "rewards/rejected": -8.018420219421387, "step": 977 }, { "epoch": 1.28, "learning_rate": 3.2684521590511566e-05, "logits/chosen": -1.8193577527999878, "logits/rejected": -1.8498221635818481, "logps/chosen": -167.66262817382812, "logps/rejected": -270.91973876953125, "loss": 0.0469, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8246172666549683, "rewards/margins": 9.927530288696289, "rewards/rejected": -9.102913856506348, "step": 978 }, { "epoch": 1.28, "learning_rate": 3.2650418164858284e-05, "logits/chosen": -1.5121347904205322, "logits/rejected": -1.5500718355178833, "logps/chosen": -186.26876831054688, "logps/rejected": -265.0696105957031, "loss": 0.0578, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2416999489068985, "rewards/margins": 7.793680667877197, "rewards/rejected": -8.035380363464355, "step": 979 }, { "epoch": 1.28, "learning_rate": 3.261629902459e-05, "logits/chosen": -1.4318028688430786, "logits/rejected": -1.4715421199798584, "logps/chosen": -162.150146484375, "logps/rejected": -278.2225341796875, "loss": 0.0437, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4904072284698486, "rewards/margins": 11.90435791015625, "rewards/rejected": -10.41395092010498, "step": 980 }, { "epoch": 1.28, "learning_rate": 3.258216423979037e-05, "logits/chosen": -1.8608146905899048, "logits/rejected": -1.864013910293579, "logps/chosen": -269.37750244140625, "logps/rejected": -334.7197265625, "loss": 0.1331, "rewards/accuracies": 0.875, "rewards/chosen": -0.6033797860145569, "rewards/margins": 7.50750207901001, "rewards/rejected": -8.110881805419922, "step": 981 }, { "epoch": 1.29, "learning_rate": 3.254801388057514e-05, "logits/chosen": -1.7995353937149048, "logits/rejected": -1.7990977764129639, "logps/chosen": -202.12106323242188, "logps/rejected": -247.49078369140625, "loss": 0.2716, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1631054878234863, "rewards/margins": 4.216814994812012, "rewards/rejected": -5.37992000579834, "step": 982 }, { "epoch": 1.29, "learning_rate": 3.2513848017092113e-05, "logits/chosen": -1.7793301343917847, "logits/rejected": -1.8699491024017334, "logps/chosen": -148.6905059814453, "logps/rejected": -219.80819702148438, "loss": 0.1464, "rewards/accuracies": 0.875, "rewards/chosen": 0.534216046333313, "rewards/margins": 6.915114402770996, "rewards/rejected": -6.380898952484131, "step": 983 }, { "epoch": 1.29, "learning_rate": 3.2479666719520886e-05, "logits/chosen": -1.8794190883636475, "logits/rejected": -1.856317400932312, "logps/chosen": -180.57888793945312, "logps/rejected": -245.49722290039062, "loss": 0.1261, "rewards/accuracies": 0.875, "rewards/chosen": -0.5208327174186707, "rewards/margins": 7.4504923820495605, "rewards/rejected": -7.971325874328613, "step": 984 }, { "epoch": 1.29, "learning_rate": 3.2445470058072766e-05, "logits/chosen": -1.51198410987854, "logits/rejected": -1.5328192710876465, "logps/chosen": -184.41305541992188, "logps/rejected": -261.3963623046875, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 0.18090005218982697, "rewards/margins": 8.79633903503418, "rewards/rejected": -8.615439414978027, "step": 985 }, { "epoch": 1.29, "learning_rate": 3.2411258102990646e-05, "logits/chosen": -1.5592520236968994, "logits/rejected": -1.514370083808899, "logps/chosen": -200.31686401367188, "logps/rejected": -272.80242919921875, "loss": 0.157, "rewards/accuracies": 0.875, "rewards/chosen": 0.8379793167114258, "rewards/margins": 7.916141986846924, "rewards/rejected": -7.078163146972656, "step": 986 }, { "epoch": 1.29, "learning_rate": 3.23770309245488e-05, "logits/chosen": -1.639445424079895, "logits/rejected": -1.7446305751800537, "logps/chosen": -166.3204803466797, "logps/rejected": -287.1727294921875, "loss": 0.0799, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19196566939353943, "rewards/margins": 9.632253646850586, "rewards/rejected": -9.82421875, "step": 987 }, { "epoch": 1.29, "learning_rate": 3.23427885930528e-05, "logits/chosen": -1.686637043952942, "logits/rejected": -1.6766071319580078, "logps/chosen": -166.44894409179688, "logps/rejected": -271.5466003417969, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -0.21667608618736267, "rewards/margins": 8.782600402832031, "rewards/rejected": -8.999276161193848, "step": 988 }, { "epoch": 1.29, "learning_rate": 3.230853117883933e-05, "logits/chosen": -1.733090877532959, "logits/rejected": -1.7348949909210205, "logps/chosen": -174.36004638671875, "logps/rejected": -263.1109313964844, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 0.019744902849197388, "rewards/margins": 8.265658378601074, "rewards/rejected": -8.2459135055542, "step": 989 }, { "epoch": 1.3, "learning_rate": 3.227425875227605e-05, "logits/chosen": -1.705898642539978, "logits/rejected": -1.6789770126342773, "logps/chosen": -163.54830932617188, "logps/rejected": -250.91964721679688, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 0.3017692565917969, "rewards/margins": 8.326699256896973, "rewards/rejected": -8.024930000305176, "step": 990 }, { "epoch": 1.3, "learning_rate": 3.223997138376146e-05, "logits/chosen": -1.7970457077026367, "logits/rejected": -1.8647854328155518, "logps/chosen": -165.6417236328125, "logps/rejected": -269.0453186035156, "loss": 0.1124, "rewards/accuracies": 0.875, "rewards/chosen": 0.12270615994930267, "rewards/margins": 9.013164520263672, "rewards/rejected": -8.890458106994629, "step": 991 }, { "epoch": 1.3, "learning_rate": 3.220566914372477e-05, "logits/chosen": -1.5819454193115234, "logits/rejected": -1.594781517982483, "logps/chosen": -245.6867218017578, "logps/rejected": -308.18817138671875, "loss": 0.1232, "rewards/accuracies": 0.875, "rewards/chosen": -2.10953688621521, "rewards/margins": 5.188804626464844, "rewards/rejected": -7.298341274261475, "step": 992 }, { "epoch": 1.3, "learning_rate": 3.2171352102625716e-05, "logits/chosen": -1.5325591564178467, "logits/rejected": -1.6081345081329346, "logps/chosen": -203.92449951171875, "logps/rejected": -325.21148681640625, "loss": 0.0911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15414515137672424, "rewards/margins": 8.53257942199707, "rewards/rejected": -8.378433227539062, "step": 993 }, { "epoch": 1.3, "learning_rate": 3.213702033095444e-05, "logits/chosen": -1.7691090106964111, "logits/rejected": -1.7098833322525024, "logps/chosen": -187.43008422851562, "logps/rejected": -263.68438720703125, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 0.04066956788301468, "rewards/margins": 8.466104507446289, "rewards/rejected": -8.425435066223145, "step": 994 }, { "epoch": 1.3, "learning_rate": 3.210267389923135e-05, "logits/chosen": -1.6959140300750732, "logits/rejected": -1.5891695022583008, "logps/chosen": -178.58628845214844, "logps/rejected": -301.677978515625, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 0.10630902647972107, "rewards/margins": 10.849169731140137, "rewards/rejected": -10.74285888671875, "step": 995 }, { "epoch": 1.3, "learning_rate": 3.2068312878006955e-05, "logits/chosen": -1.8529281616210938, "logits/rejected": -1.8796100616455078, "logps/chosen": -184.71463012695312, "logps/rejected": -256.54901123046875, "loss": 0.1309, "rewards/accuracies": 0.875, "rewards/chosen": 0.16027602553367615, "rewards/margins": 8.256537437438965, "rewards/rejected": -8.096261978149414, "step": 996 }, { "epoch": 1.3, "learning_rate": 3.2033937337861744e-05, "logits/chosen": -1.5665572881698608, "logits/rejected": -1.5762784481048584, "logps/chosen": -153.64761352539062, "logps/rejected": -258.431396484375, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.6890445947647095, "rewards/margins": 9.799956321716309, "rewards/rejected": -9.110910415649414, "step": 997 }, { "epoch": 1.31, "learning_rate": 3.199954734940603e-05, "logits/chosen": -1.528867244720459, "logits/rejected": -1.5182762145996094, "logps/chosen": -205.97349548339844, "logps/rejected": -284.24505615234375, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -1.1888141632080078, "rewards/margins": 8.177652359008789, "rewards/rejected": -9.366466522216797, "step": 998 }, { "epoch": 1.31, "learning_rate": 3.196514298327979e-05, "logits/chosen": -1.818434476852417, "logits/rejected": -1.8412598371505737, "logps/chosen": -169.71066284179688, "logps/rejected": -250.16668701171875, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.2685055732727051, "rewards/margins": 7.847587585449219, "rewards/rejected": -8.116093635559082, "step": 999 }, { "epoch": 1.31, "learning_rate": 3.193072431015254e-05, "logits/chosen": -1.796441674232483, "logits/rejected": -1.8247101306915283, "logps/chosen": -242.6537628173828, "logps/rejected": -339.2400817871094, "loss": 0.1228, "rewards/accuracies": 0.875, "rewards/chosen": -1.054071068763733, "rewards/margins": 8.617059707641602, "rewards/rejected": -9.671130180358887, "step": 1000 } ], "logging_steps": 1, "max_steps": 2292, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }