{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 20, "global_step": 2864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00558659217877095, "grad_norm": 28.603565649816808, "learning_rate": 3.4843205574912892e-09, "logits/chosen": -2.4004337787628174, "logits/rejected": -2.5142080783843994, "logps/chosen": -60.584632873535156, "logps/rejected": -42.787071228027344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0111731843575419, "grad_norm": 29.276010564093625, "learning_rate": 6.9686411149825785e-09, "logits/chosen": -2.4949634075164795, "logits/rejected": -2.4507484436035156, "logps/chosen": -19.39887046813965, "logps/rejected": -25.19920539855957, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.01675977653631285, "grad_norm": 28.859494722690478, "learning_rate": 1.0452961672473868e-08, "logits/chosen": -2.2804834842681885, "logits/rejected": -2.27510666847229, "logps/chosen": -27.924236297607422, "logps/rejected": -39.671104431152344, "loss": 0.689, "rewards/accuracies": 0.75, "rewards/chosen": 0.009498548693954945, "rewards/margins": 0.009498548693954945, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.0223463687150838, "grad_norm": 32.50888450037989, "learning_rate": 1.3937282229965157e-08, "logits/chosen": -2.056849718093872, "logits/rejected": -2.039930820465088, "logps/chosen": -50.10342788696289, "logps/rejected": -44.76918029785156, "loss": 0.6933, "rewards/accuracies": 0.75, "rewards/chosen": 0.02802419662475586, "rewards/margins": 0.02802419662475586, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.027932960893854747, "grad_norm": 28.857154004570457, "learning_rate": 1.7421602787456446e-08, "logits/chosen": -2.272435426712036, "logits/rejected": -2.301142454147339, "logps/chosen": -38.176116943359375, "logps/rejected": -50.210296630859375, "loss": 0.6965, "rewards/accuracies": 0.25, "rewards/chosen": -0.011784981936216354, "rewards/margins": -0.011784981936216354, "rewards/rejected": 0.0, "step": 5 }, { "epoch": 0.0335195530726257, "grad_norm": 26.415733391767393, "learning_rate": 2.0905923344947736e-08, "logits/chosen": -2.793938159942627, "logits/rejected": -2.4558026790618896, "logps/chosen": -18.068748474121094, "logps/rejected": -63.842552185058594, "loss": 0.6959, "rewards/accuracies": 0.75, "rewards/chosen": 0.01914348639547825, "rewards/margins": 0.01914348639547825, "rewards/rejected": 0.0, "step": 6 }, { "epoch": 0.03910614525139665, "grad_norm": 30.429564005701334, "learning_rate": 2.4390243902439023e-08, "logits/chosen": -2.6528186798095703, "logits/rejected": -2.5782175064086914, "logps/chosen": -18.585830688476562, "logps/rejected": -21.081623077392578, "loss": 0.6907, "rewards/accuracies": 0.0, "rewards/chosen": -0.010954786092042923, "rewards/margins": -0.010954786092042923, "rewards/rejected": 0.0, "step": 7 }, { "epoch": 0.0446927374301676, "grad_norm": 31.759150028352973, "learning_rate": 2.7874564459930314e-08, "logits/chosen": -2.546023368835449, "logits/rejected": -2.566352128982544, "logps/chosen": -20.15776824951172, "logps/rejected": -29.43256950378418, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.008025838062167168, "rewards/margins": 0.008025838062167168, "rewards/rejected": 0.0, "step": 8 }, { "epoch": 0.05027932960893855, "grad_norm": 26.754315287337864, "learning_rate": 3.13588850174216e-08, "logits/chosen": -2.5980911254882812, "logits/rejected": -2.507265567779541, "logps/chosen": -28.086103439331055, "logps/rejected": -36.260292053222656, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": -0.005537462420761585, "rewards/margins": -0.005537462420761585, "rewards/rejected": 0.0, "step": 9 }, { "epoch": 0.055865921787709494, "grad_norm": 27.21359249234941, "learning_rate": 3.484320557491289e-08, "logits/chosen": -2.3490583896636963, "logits/rejected": -2.2292792797088623, "logps/chosen": -53.409996032714844, "logps/rejected": -55.07990264892578, "loss": 0.693, "rewards/accuracies": 0.25, "rewards/chosen": -0.013922978192567825, "rewards/margins": -0.013922978192567825, "rewards/rejected": 0.0, "step": 10 }, { "epoch": 0.061452513966480445, "grad_norm": 28.43356949422403, "learning_rate": 3.832752613240418e-08, "logits/chosen": -2.4062752723693848, "logits/rejected": -2.2725396156311035, "logps/chosen": -18.990951538085938, "logps/rejected": -27.86153793334961, "loss": 0.6949, "rewards/accuracies": 0.75, "rewards/chosen": 0.00549395103007555, "rewards/margins": 0.00549395103007555, "rewards/rejected": 0.0, "step": 11 }, { "epoch": 0.0670391061452514, "grad_norm": 26.03024884762362, "learning_rate": 4.181184668989547e-08, "logits/chosen": -2.7887275218963623, "logits/rejected": -2.4660379886627197, "logps/chosen": -15.426248550415039, "logps/rejected": -43.07337188720703, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.004401779733598232, "rewards/margins": -0.004401779733598232, "rewards/rejected": 0.0, "step": 12 }, { "epoch": 0.07262569832402235, "grad_norm": 29.15535304010801, "learning_rate": 4.529616724738676e-08, "logits/chosen": -2.372344493865967, "logits/rejected": -2.0893101692199707, "logps/chosen": -33.84700393676758, "logps/rejected": -52.90555953979492, "loss": 0.6907, "rewards/accuracies": 0.75, "rewards/chosen": 0.025128602981567383, "rewards/margins": 0.025128602981567383, "rewards/rejected": 0.0, "step": 13 }, { "epoch": 0.0782122905027933, "grad_norm": 29.514245218176903, "learning_rate": 4.878048780487805e-08, "logits/chosen": -2.565272331237793, "logits/rejected": -2.392062187194824, "logps/chosen": -18.883819580078125, "logps/rejected": -48.590614318847656, "loss": 0.6956, "rewards/accuracies": 1.0, "rewards/chosen": 0.01730327680706978, "rewards/margins": 0.01730327680706978, "rewards/rejected": 0.0, "step": 14 }, { "epoch": 0.08379888268156424, "grad_norm": 30.166208831944306, "learning_rate": 5.2264808362369334e-08, "logits/chosen": -2.1422746181488037, "logits/rejected": -2.060894012451172, "logps/chosen": -42.46976852416992, "logps/rejected": -50.680320739746094, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.0040832520462572575, "rewards/margins": -0.0040832520462572575, "rewards/rejected": 0.0, "step": 15 }, { "epoch": 0.0893854748603352, "grad_norm": 27.010877795355334, "learning_rate": 5.574912891986063e-08, "logits/chosen": -2.048851728439331, "logits/rejected": -2.078244686126709, "logps/chosen": -32.976806640625, "logps/rejected": -39.53431701660156, "loss": 0.6963, "rewards/accuracies": 0.25, "rewards/chosen": -0.01168208196759224, "rewards/margins": -0.01168208196759224, "rewards/rejected": 0.0, "step": 16 }, { "epoch": 0.09497206703910614, "grad_norm": 31.303112360150603, "learning_rate": 5.9233449477351915e-08, "logits/chosen": -2.792558193206787, "logits/rejected": -2.1472291946411133, "logps/chosen": -24.31830596923828, "logps/rejected": -71.05191040039062, "loss": 0.6949, "rewards/accuracies": 0.75, "rewards/chosen": 0.008655214682221413, "rewards/margins": 0.008655214682221413, "rewards/rejected": 0.0, "step": 17 }, { "epoch": 0.1005586592178771, "grad_norm": 30.389870821612302, "learning_rate": 6.27177700348432e-08, "logits/chosen": -2.4866530895233154, "logits/rejected": -2.375359296798706, "logps/chosen": -37.48762512207031, "logps/rejected": -38.95714569091797, "loss": 0.6976, "rewards/accuracies": 0.25, "rewards/chosen": -0.0314456932246685, "rewards/margins": -0.0314456932246685, "rewards/rejected": 0.0, "step": 18 }, { "epoch": 0.10614525139664804, "grad_norm": 29.563734417452384, "learning_rate": 6.620209059233449e-08, "logits/chosen": -2.5353310108184814, "logits/rejected": -2.5976669788360596, "logps/chosen": -24.29471778869629, "logps/rejected": -25.59542465209961, "loss": 0.694, "rewards/accuracies": 0.75, "rewards/chosen": 0.017024923115968704, "rewards/margins": 0.017024923115968704, "rewards/rejected": 0.0, "step": 19 }, { "epoch": 0.11173184357541899, "grad_norm": 28.023653019850883, "learning_rate": 6.968641114982578e-08, "logits/chosen": -2.481630563735962, "logits/rejected": -2.3800740242004395, "logps/chosen": -43.142112731933594, "logps/rejected": -45.48561096191406, "loss": 0.6912, "rewards/accuracies": 0.25, "rewards/chosen": -0.030663300305604935, "rewards/margins": -0.030663300305604935, "rewards/rejected": 0.0, "step": 20 }, { "epoch": 0.11173184357541899, "eval_logits/chosen": -2.378596305847168, "eval_logits/rejected": -2.231410264968872, "eval_logps/chosen": -30.04729652404785, "eval_logps/rejected": -41.63738250732422, "eval_loss": 0.6928394436836243, "eval_rewards/accuracies": 0.5, "eval_rewards/chosen": 0.002274421276524663, "eval_rewards/margins": 0.002274421276524663, "eval_rewards/rejected": 0.0, "eval_runtime": 36.2639, "eval_samples_per_second": 8.548, "eval_steps_per_second": 0.276, "step": 20 }, { "epoch": 0.11731843575418995, "grad_norm": 28.713464267987312, "learning_rate": 7.317073170731706e-08, "logits/chosen": -2.1179280281066895, "logits/rejected": -1.9675930738449097, "logps/chosen": -39.224002838134766, "logps/rejected": -41.40558624267578, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 9.074225090444088e-05, "rewards/margins": 9.074225090444088e-05, "rewards/rejected": 0.0, "step": 21 }, { "epoch": 0.12290502793296089, "grad_norm": 29.03402105884427, "learning_rate": 7.665505226480836e-08, "logits/chosen": -2.4489192962646484, "logits/rejected": -2.542243242263794, "logps/chosen": -33.14081954956055, "logps/rejected": -38.48543167114258, "loss": 0.6947, "rewards/accuracies": 0.25, "rewards/chosen": -0.018711712211370468, "rewards/margins": -0.018711712211370468, "rewards/rejected": 0.0, "step": 22 }, { "epoch": 0.12849162011173185, "grad_norm": 33.33325134064739, "learning_rate": 8.013937282229964e-08, "logits/chosen": -2.0170164108276367, "logits/rejected": -2.1128463745117188, "logps/chosen": -98.69719696044922, "logps/rejected": -58.296714782714844, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": -0.018199540674686432, "rewards/margins": -0.018199540674686432, "rewards/rejected": 0.0, "step": 23 }, { "epoch": 0.1340782122905028, "grad_norm": 27.061983809004143, "learning_rate": 8.362369337979095e-08, "logits/chosen": -2.227400302886963, "logits/rejected": -2.2094178199768066, "logps/chosen": -33.232845306396484, "logps/rejected": -34.66838836669922, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005850789602845907, "rewards/margins": 0.0005850789602845907, "rewards/rejected": 0.0, "step": 24 }, { "epoch": 0.13966480446927373, "grad_norm": 30.631315818668092, "learning_rate": 8.710801393728223e-08, "logits/chosen": -2.4377002716064453, "logits/rejected": -2.3588991165161133, "logps/chosen": -21.7973575592041, "logps/rejected": -40.57929611206055, "loss": 0.6949, "rewards/accuracies": 0.75, "rewards/chosen": 0.02112269401550293, "rewards/margins": 0.02112269401550293, "rewards/rejected": 0.0, "step": 25 }, { "epoch": 0.1452513966480447, "grad_norm": 29.378197798170234, "learning_rate": 9.059233449477352e-08, "logits/chosen": -2.628924608230591, "logits/rejected": -2.482461452484131, "logps/chosen": -23.92325210571289, "logps/rejected": -44.77006530761719, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.004933738149702549, "rewards/margins": 0.004933738149702549, "rewards/rejected": 0.0, "step": 26 }, { "epoch": 0.15083798882681565, "grad_norm": 29.535605005914892, "learning_rate": 9.407665505226481e-08, "logits/chosen": -2.4817450046539307, "logits/rejected": -2.485926866531372, "logps/chosen": -59.53948211669922, "logps/rejected": -56.88829040527344, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.02714724652469158, "rewards/margins": 0.02714724652469158, "rewards/rejected": 0.0, "step": 27 }, { "epoch": 0.1564245810055866, "grad_norm": 27.59267993779762, "learning_rate": 9.75609756097561e-08, "logits/chosen": -2.586944580078125, "logits/rejected": -2.4643876552581787, "logps/chosen": -31.718379974365234, "logps/rejected": -36.230384826660156, "loss": 0.6909, "rewards/accuracies": 0.25, "rewards/chosen": -0.027640867978334427, "rewards/margins": -0.027640867978334427, "rewards/rejected": 0.0, "step": 28 }, { "epoch": 0.16201117318435754, "grad_norm": 29.667583017088273, "learning_rate": 1.0104529616724739e-07, "logits/chosen": -2.30635404586792, "logits/rejected": -2.212883710861206, "logps/chosen": -34.53228759765625, "logps/rejected": -37.26000213623047, "loss": 0.6898, "rewards/accuracies": 0.75, "rewards/chosen": 0.021092604845762253, "rewards/margins": 0.021092604845762253, "rewards/rejected": 0.0, "step": 29 }, { "epoch": 0.16759776536312848, "grad_norm": 29.98028101754352, "learning_rate": 1.0452961672473867e-07, "logits/chosen": -2.4045355319976807, "logits/rejected": -2.4857683181762695, "logps/chosen": -34.87571716308594, "logps/rejected": -52.67020034790039, "loss": 0.689, "rewards/accuracies": 0.25, "rewards/chosen": -0.011652231216430664, "rewards/margins": -0.011652231216430664, "rewards/rejected": 0.0, "step": 30 }, { "epoch": 0.17318435754189945, "grad_norm": 30.079938013679776, "learning_rate": 1.0801393728222996e-07, "logits/chosen": -2.4215786457061768, "logits/rejected": -2.1780693531036377, "logps/chosen": -20.111202239990234, "logps/rejected": -49.6775016784668, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008932588389143348, "rewards/margins": -0.0008932588389143348, "rewards/rejected": 0.0, "step": 31 }, { "epoch": 0.1787709497206704, "grad_norm": 29.0354625721042, "learning_rate": 1.1149825783972126e-07, "logits/chosen": -2.314462423324585, "logits/rejected": -2.171380043029785, "logps/chosen": -28.89542007446289, "logps/rejected": -44.24406433105469, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": 0.028475143015384674, "rewards/margins": 0.028475143015384674, "rewards/rejected": 0.0, "step": 32 }, { "epoch": 0.18435754189944134, "grad_norm": 28.82319834972392, "learning_rate": 1.1498257839721254e-07, "logits/chosen": -2.473206043243408, "logits/rejected": -2.5393941402435303, "logps/chosen": -28.577171325683594, "logps/rejected": -34.01260757446289, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.00867919996380806, "rewards/margins": 0.00867919996380806, "rewards/rejected": 0.0, "step": 33 }, { "epoch": 0.18994413407821228, "grad_norm": 26.99801083582208, "learning_rate": 1.1846689895470383e-07, "logits/chosen": -2.5871434211730957, "logits/rejected": -2.204688310623169, "logps/chosen": -24.449928283691406, "logps/rejected": -66.65013122558594, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": 0.022472620010375977, "rewards/margins": 0.022472620010375977, "rewards/rejected": 0.0, "step": 34 }, { "epoch": 0.19553072625698323, "grad_norm": 28.371452721285568, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.438382148742676, "logits/rejected": -2.376980781555176, "logps/chosen": -20.550439834594727, "logps/rejected": -30.104656219482422, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": 0.009609270840883255, "rewards/margins": 0.009609270840883255, "rewards/rejected": 0.0, "step": 35 }, { "epoch": 0.2011173184357542, "grad_norm": 27.7150904468452, "learning_rate": 1.254355400696864e-07, "logits/chosen": -2.1353628635406494, "logits/rejected": -2.0030341148376465, "logps/chosen": -52.302974700927734, "logps/rejected": -63.30662536621094, "loss": 0.6824, "rewards/accuracies": 0.5, "rewards/chosen": 0.006544208619743586, "rewards/margins": 0.006544208619743586, "rewards/rejected": 0.0, "step": 36 }, { "epoch": 0.20670391061452514, "grad_norm": 28.68285689295653, "learning_rate": 1.289198606271777e-07, "logits/chosen": -2.4385170936584473, "logits/rejected": -2.4843199253082275, "logps/chosen": -30.546653747558594, "logps/rejected": -37.64432907104492, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": 0.010574293322861195, "rewards/margins": 0.010574293322861195, "rewards/rejected": 0.0, "step": 37 }, { "epoch": 0.2122905027932961, "grad_norm": 25.934450282985118, "learning_rate": 1.3240418118466898e-07, "logits/chosen": -2.391115665435791, "logits/rejected": -2.1517765522003174, "logps/chosen": -36.59046936035156, "logps/rejected": -54.95623016357422, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.036093570291996, "rewards/margins": 0.036093570291996, "rewards/rejected": 0.0, "step": 38 }, { "epoch": 0.21787709497206703, "grad_norm": 29.575773366216787, "learning_rate": 1.3588850174216026e-07, "logits/chosen": -2.0830588340759277, "logits/rejected": -2.0630950927734375, "logps/chosen": -51.38600158691406, "logps/rejected": -51.52277374267578, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": 0.047041941434144974, "rewards/margins": 0.047041941434144974, "rewards/rejected": 0.0, "step": 39 }, { "epoch": 0.22346368715083798, "grad_norm": 27.986584327852498, "learning_rate": 1.3937282229965157e-07, "logits/chosen": -2.1863017082214355, "logits/rejected": -2.054046630859375, "logps/chosen": -36.82038879394531, "logps/rejected": -28.519256591796875, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.05012211948633194, "rewards/margins": 0.05012211948633194, "rewards/rejected": 0.0, "step": 40 }, { "epoch": 0.22346368715083798, "eval_logits/chosen": -2.383514881134033, "eval_logits/rejected": -2.2417750358581543, "eval_logps/chosen": -29.78684425354004, "eval_logps/rejected": -41.39722442626953, "eval_loss": 0.6780919432640076, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.028319690376520157, "eval_rewards/margins": 0.028319690376520157, "eval_rewards/rejected": 0.0, "eval_runtime": 33.55, "eval_samples_per_second": 9.24, "eval_steps_per_second": 0.298, "step": 40 }, { "epoch": 0.22905027932960895, "grad_norm": 27.886526350000842, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -2.4471418857574463, "logits/rejected": -2.276179552078247, "logps/chosen": -38.71142578125, "logps/rejected": -44.0977783203125, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.045044660568237305, "rewards/margins": 0.045044660568237305, "rewards/rejected": 0.0, "step": 41 }, { "epoch": 0.2346368715083799, "grad_norm": 28.19236917478761, "learning_rate": 1.4634146341463413e-07, "logits/chosen": -2.6012725830078125, "logits/rejected": -2.3238584995269775, "logps/chosen": -21.031005859375, "logps/rejected": -43.29925537109375, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.051784299314022064, "rewards/margins": 0.051784299314022064, "rewards/rejected": 0.0, "step": 42 }, { "epoch": 0.24022346368715083, "grad_norm": 27.00095263618661, "learning_rate": 1.498257839721254e-07, "logits/chosen": -2.532641887664795, "logits/rejected": -2.5259850025177, "logps/chosen": -46.74769592285156, "logps/rejected": -31.371822357177734, "loss": 0.6785, "rewards/accuracies": 0.75, "rewards/chosen": 0.03024010732769966, "rewards/margins": 0.03024010732769966, "rewards/rejected": 0.0, "step": 43 }, { "epoch": 0.24581005586592178, "grad_norm": 27.735960230898932, "learning_rate": 1.5331010452961672e-07, "logits/chosen": -2.554621934890747, "logits/rejected": -2.3309123516082764, "logps/chosen": -29.774852752685547, "logps/rejected": -50.249359130859375, "loss": 0.6735, "rewards/accuracies": 0.75, "rewards/chosen": 0.015763472765684128, "rewards/margins": 0.015763472765684128, "rewards/rejected": 0.0, "step": 44 }, { "epoch": 0.25139664804469275, "grad_norm": 25.760071781763443, "learning_rate": 1.56794425087108e-07, "logits/chosen": -2.679527521133423, "logits/rejected": -2.3180019855499268, "logps/chosen": -29.519779205322266, "logps/rejected": -51.563804626464844, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.019712496548891068, "rewards/margins": 0.019712496548891068, "rewards/rejected": 0.0, "step": 45 }, { "epoch": 0.2569832402234637, "grad_norm": 27.542371432519047, "learning_rate": 1.6027874564459928e-07, "logits/chosen": -1.9994746446609497, "logits/rejected": -2.045717239379883, "logps/chosen": -67.6978530883789, "logps/rejected": -36.14336395263672, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.0912964791059494, "rewards/margins": 0.0912964791059494, "rewards/rejected": 0.0, "step": 46 }, { "epoch": 0.26256983240223464, "grad_norm": 26.745703810276925, "learning_rate": 1.6376306620209058e-07, "logits/chosen": -2.550858736038208, "logits/rejected": -2.3014934062957764, "logps/chosen": -26.110837936401367, "logps/rejected": -42.080474853515625, "loss": 0.6742, "rewards/accuracies": 0.75, "rewards/chosen": 0.017519522458314896, "rewards/margins": 0.017519522458314896, "rewards/rejected": 0.0, "step": 47 }, { "epoch": 0.2681564245810056, "grad_norm": 26.81973377757222, "learning_rate": 1.672473867595819e-07, "logits/chosen": -2.5700764656066895, "logits/rejected": -2.4172418117523193, "logps/chosen": -29.44329071044922, "logps/rejected": -34.067237854003906, "loss": 0.6669, "rewards/accuracies": 1.0, "rewards/chosen": 0.0359685905277729, "rewards/margins": 0.0359685905277729, "rewards/rejected": 0.0, "step": 48 }, { "epoch": 0.2737430167597765, "grad_norm": 27.593766502688794, "learning_rate": 1.7073170731707317e-07, "logits/chosen": -2.271967887878418, "logits/rejected": -2.3903441429138184, "logps/chosen": -56.53013610839844, "logps/rejected": -41.234901428222656, "loss": 0.6624, "rewards/accuracies": 1.0, "rewards/chosen": 0.06545233726501465, "rewards/margins": 0.06545233726501465, "rewards/rejected": 0.0, "step": 49 }, { "epoch": 0.27932960893854747, "grad_norm": 23.1080148340995, "learning_rate": 1.7421602787456445e-07, "logits/chosen": -2.7415006160736084, "logits/rejected": -2.479442834854126, "logps/chosen": -22.101638793945312, "logps/rejected": -32.386566162109375, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.07915637642145157, "rewards/margins": 0.07915637642145157, "rewards/rejected": 0.0, "step": 50 }, { "epoch": 0.2849162011173184, "grad_norm": 26.221650919673365, "learning_rate": 1.7770034843205576e-07, "logits/chosen": -2.509009599685669, "logits/rejected": -1.923651933670044, "logps/chosen": -25.04236602783203, "logps/rejected": -69.1952133178711, "loss": 0.6521, "rewards/accuracies": 1.0, "rewards/chosen": 0.09428238868713379, "rewards/margins": 0.09428238868713379, "rewards/rejected": 0.0, "step": 51 }, { "epoch": 0.2905027932960894, "grad_norm": 24.728560083374884, "learning_rate": 1.8118466898954704e-07, "logits/chosen": -2.480639696121216, "logits/rejected": -2.4123823642730713, "logps/chosen": -23.67394256591797, "logps/rejected": -28.476255416870117, "loss": 0.6529, "rewards/accuracies": 1.0, "rewards/chosen": 0.08266758918762207, "rewards/margins": 0.08266758918762207, "rewards/rejected": 0.0, "step": 52 }, { "epoch": 0.29608938547486036, "grad_norm": 24.21861955215401, "learning_rate": 1.8466898954703832e-07, "logits/chosen": -2.817721128463745, "logits/rejected": -2.583847761154175, "logps/chosen": -28.653152465820312, "logps/rejected": -53.04334259033203, "loss": 0.6493, "rewards/accuracies": 1.0, "rewards/chosen": 0.12885236740112305, "rewards/margins": 0.12885236740112305, "rewards/rejected": 0.0, "step": 53 }, { "epoch": 0.3016759776536313, "grad_norm": 27.927454062977144, "learning_rate": 1.8815331010452963e-07, "logits/chosen": -2.7345516681671143, "logits/rejected": -2.321146011352539, "logps/chosen": -23.42495346069336, "logps/rejected": -48.8504524230957, "loss": 0.6421, "rewards/accuracies": 1.0, "rewards/chosen": 0.08252206444740295, "rewards/margins": 0.08252206444740295, "rewards/rejected": 0.0, "step": 54 }, { "epoch": 0.30726256983240224, "grad_norm": 26.878483310239424, "learning_rate": 1.916376306620209e-07, "logits/chosen": -2.289569616317749, "logits/rejected": -2.479778289794922, "logps/chosen": -55.436302185058594, "logps/rejected": -37.89727020263672, "loss": 0.6379, "rewards/accuracies": 1.0, "rewards/chosen": 0.15553730726242065, "rewards/margins": 0.15553730726242065, "rewards/rejected": 0.0, "step": 55 }, { "epoch": 0.3128491620111732, "grad_norm": 26.65427216585537, "learning_rate": 1.951219512195122e-07, "logits/chosen": -2.139517068862915, "logits/rejected": -2.0566482543945312, "logps/chosen": -48.204933166503906, "logps/rejected": -44.711669921875, "loss": 0.6426, "rewards/accuracies": 1.0, "rewards/chosen": 0.1732596606016159, "rewards/margins": 0.1732596606016159, "rewards/rejected": 0.0, "step": 56 }, { "epoch": 0.31843575418994413, "grad_norm": 24.87856002674328, "learning_rate": 1.9860627177700347e-07, "logits/chosen": -2.4646410942077637, "logits/rejected": -2.167778491973877, "logps/chosen": -24.157302856445312, "logps/rejected": -48.043636322021484, "loss": 0.6335, "rewards/accuracies": 1.0, "rewards/chosen": 0.1372363567352295, "rewards/margins": 0.1372363567352295, "rewards/rejected": 0.0, "step": 57 }, { "epoch": 0.3240223463687151, "grad_norm": 24.632709992488646, "learning_rate": 2.0209059233449478e-07, "logits/chosen": -2.522116184234619, "logits/rejected": -2.328913927078247, "logps/chosen": -35.4097900390625, "logps/rejected": -47.49260711669922, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 0.11406522244215012, "rewards/margins": 0.11406522244215012, "rewards/rejected": 0.0, "step": 58 }, { "epoch": 0.329608938547486, "grad_norm": 26.555932940372706, "learning_rate": 2.0557491289198606e-07, "logits/chosen": -2.312364339828491, "logits/rejected": -2.3537187576293945, "logps/chosen": -58.54545593261719, "logps/rejected": -63.535945892333984, "loss": 0.6279, "rewards/accuracies": 1.0, "rewards/chosen": 0.10076670348644257, "rewards/margins": 0.10076670348644257, "rewards/rejected": 0.0, "step": 59 }, { "epoch": 0.33519553072625696, "grad_norm": 23.362895588849486, "learning_rate": 2.0905923344947734e-07, "logits/chosen": -2.3553497791290283, "logits/rejected": -2.3863284587860107, "logps/chosen": -27.702287673950195, "logps/rejected": -33.98295211791992, "loss": 0.6352, "rewards/accuracies": 1.0, "rewards/chosen": 0.09926161170005798, "rewards/margins": 0.09926161170005798, "rewards/rejected": 0.0, "step": 60 }, { "epoch": 0.33519553072625696, "eval_logits/chosen": -2.450570821762085, "eval_logits/rejected": -2.313969850540161, "eval_logps/chosen": -28.935455322265625, "eval_logps/rejected": -40.358070373535156, "eval_loss": 0.633419930934906, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 0.11345873773097992, "eval_rewards/margins": 0.11345873773097992, "eval_rewards/rejected": 0.0, "eval_runtime": 33.9021, "eval_samples_per_second": 9.144, "eval_steps_per_second": 0.295, "step": 60 }, { "epoch": 0.3407821229050279, "grad_norm": 23.278035097694193, "learning_rate": 2.1254355400696864e-07, "logits/chosen": -2.9201314449310303, "logits/rejected": -2.522038221359253, "logps/chosen": -22.127304077148438, "logps/rejected": -34.22399139404297, "loss": 0.6292, "rewards/accuracies": 1.0, "rewards/chosen": 0.1686132699251175, "rewards/margins": 0.1686132699251175, "rewards/rejected": 0.0, "step": 61 }, { "epoch": 0.3463687150837989, "grad_norm": 24.907424656036728, "learning_rate": 2.1602787456445992e-07, "logits/chosen": -2.494243621826172, "logits/rejected": -2.46530818939209, "logps/chosen": -25.39417839050293, "logps/rejected": -51.274375915527344, "loss": 0.6286, "rewards/accuracies": 1.0, "rewards/chosen": 0.12252993881702423, "rewards/margins": 0.12252993881702423, "rewards/rejected": 0.0, "step": 62 }, { "epoch": 0.35195530726256985, "grad_norm": 23.084074118066138, "learning_rate": 2.195121951219512e-07, "logits/chosen": -2.5782864093780518, "logits/rejected": -2.543471574783325, "logps/chosen": -19.811384201049805, "logps/rejected": -32.101165771484375, "loss": 0.6353, "rewards/accuracies": 1.0, "rewards/chosen": 0.1308431625366211, "rewards/margins": 0.1308431625366211, "rewards/rejected": 0.0, "step": 63 }, { "epoch": 0.3575418994413408, "grad_norm": 22.982956154022173, "learning_rate": 2.229965156794425e-07, "logits/chosen": -2.518967390060425, "logits/rejected": -2.424222230911255, "logps/chosen": -21.03902816772461, "logps/rejected": -33.82353973388672, "loss": 0.6248, "rewards/accuracies": 1.0, "rewards/chosen": 0.11641013622283936, "rewards/margins": 0.11641013622283936, "rewards/rejected": 0.0, "step": 64 }, { "epoch": 0.36312849162011174, "grad_norm": 22.587929792055263, "learning_rate": 2.264808362369338e-07, "logits/chosen": -2.4381473064422607, "logits/rejected": -2.238905191421509, "logps/chosen": -23.96572494506836, "logps/rejected": -44.4195671081543, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": 0.10577362030744553, "rewards/margins": 0.10577362030744553, "rewards/rejected": 0.0, "step": 65 }, { "epoch": 0.3687150837988827, "grad_norm": 24.445932086362088, "learning_rate": 2.2996515679442507e-07, "logits/chosen": -2.3218841552734375, "logits/rejected": -1.4848906993865967, "logps/chosen": -41.72569274902344, "logps/rejected": -65.5929183959961, "loss": 0.6254, "rewards/accuracies": 0.75, "rewards/chosen": 0.17065949738025665, "rewards/margins": 0.17065949738025665, "rewards/rejected": 0.0, "step": 66 }, { "epoch": 0.3743016759776536, "grad_norm": 23.886191675716017, "learning_rate": 2.3344947735191635e-07, "logits/chosen": -2.294743299484253, "logits/rejected": -2.3369648456573486, "logps/chosen": -36.33357620239258, "logps/rejected": -31.636873245239258, "loss": 0.6213, "rewards/accuracies": 1.0, "rewards/chosen": 0.11421214044094086, "rewards/margins": 0.11421214044094086, "rewards/rejected": 0.0, "step": 67 }, { "epoch": 0.37988826815642457, "grad_norm": 23.951577576185752, "learning_rate": 2.3693379790940766e-07, "logits/chosen": -2.117687463760376, "logits/rejected": -2.0123000144958496, "logps/chosen": -51.87607955932617, "logps/rejected": -49.09307098388672, "loss": 0.6109, "rewards/accuracies": 1.0, "rewards/chosen": 0.21755558252334595, "rewards/margins": 0.21755558252334595, "rewards/rejected": 0.0, "step": 68 }, { "epoch": 0.3854748603351955, "grad_norm": 22.7328773168266, "learning_rate": 2.4041811846689894e-07, "logits/chosen": -2.36024808883667, "logits/rejected": -2.2724578380584717, "logps/chosen": -22.848896026611328, "logps/rejected": -43.429954528808594, "loss": 0.6047, "rewards/accuracies": 1.0, "rewards/chosen": 0.18426723778247833, "rewards/margins": 0.18426723778247833, "rewards/rejected": 0.0, "step": 69 }, { "epoch": 0.39106145251396646, "grad_norm": 22.123233307547665, "learning_rate": 2.439024390243902e-07, "logits/chosen": -2.68394136428833, "logits/rejected": -2.5219666957855225, "logps/chosen": -25.577144622802734, "logps/rejected": -36.436180114746094, "loss": 0.5927, "rewards/accuracies": 1.0, "rewards/chosen": 0.18651428818702698, "rewards/margins": 0.18651428818702698, "rewards/rejected": 0.0, "step": 70 }, { "epoch": 0.39664804469273746, "grad_norm": 20.74545972228492, "learning_rate": 2.473867595818815e-07, "logits/chosen": -2.5408196449279785, "logits/rejected": -2.420463800430298, "logps/chosen": -24.5079288482666, "logps/rejected": -39.61929702758789, "loss": 0.5961, "rewards/accuracies": 1.0, "rewards/chosen": 0.17850160598754883, "rewards/margins": 0.17850160598754883, "rewards/rejected": 0.0, "step": 71 }, { "epoch": 0.4022346368715084, "grad_norm": 22.381070649511372, "learning_rate": 2.508710801393728e-07, "logits/chosen": -2.9514803886413574, "logits/rejected": -2.8442459106445312, "logps/chosen": -19.637069702148438, "logps/rejected": -32.112022399902344, "loss": 0.5775, "rewards/accuracies": 1.0, "rewards/chosen": 0.24960695207118988, "rewards/margins": 0.24960695207118988, "rewards/rejected": 0.0, "step": 72 }, { "epoch": 0.40782122905027934, "grad_norm": 19.79207116570799, "learning_rate": 2.543554006968641e-07, "logits/chosen": -2.584350109100342, "logits/rejected": -2.4062235355377197, "logps/chosen": -13.208507537841797, "logps/rejected": -39.940975189208984, "loss": 0.5829, "rewards/accuracies": 1.0, "rewards/chosen": 0.1439269483089447, "rewards/margins": 0.1439269483089447, "rewards/rejected": 0.0, "step": 73 }, { "epoch": 0.4134078212290503, "grad_norm": 17.54968620695586, "learning_rate": 2.578397212543554e-07, "logits/chosen": -2.2153613567352295, "logits/rejected": -2.2630043029785156, "logps/chosen": -33.766082763671875, "logps/rejected": -40.740177154541016, "loss": 0.5786, "rewards/accuracies": 1.0, "rewards/chosen": 0.20816154778003693, "rewards/margins": 0.20816154778003693, "rewards/rejected": 0.0, "step": 74 }, { "epoch": 0.41899441340782123, "grad_norm": 18.12325013292998, "learning_rate": 2.613240418118467e-07, "logits/chosen": -2.317336082458496, "logits/rejected": -2.2485673427581787, "logps/chosen": -31.588523864746094, "logps/rejected": -34.93479537963867, "loss": 0.5801, "rewards/accuracies": 1.0, "rewards/chosen": 0.2886280119419098, "rewards/margins": 0.2886280119419098, "rewards/rejected": 0.0, "step": 75 }, { "epoch": 0.4245810055865922, "grad_norm": 17.81810199398241, "learning_rate": 2.6480836236933796e-07, "logits/chosen": -2.9728236198425293, "logits/rejected": -2.7282276153564453, "logps/chosen": -21.17477035522461, "logps/rejected": -34.339134216308594, "loss": 0.5673, "rewards/accuracies": 1.0, "rewards/chosen": 0.34478840231895447, "rewards/margins": 0.34478840231895447, "rewards/rejected": 0.0, "step": 76 }, { "epoch": 0.4301675977653631, "grad_norm": 18.57560846576237, "learning_rate": 2.682926829268293e-07, "logits/chosen": -2.258829355239868, "logits/rejected": -2.3700168132781982, "logps/chosen": -40.9664192199707, "logps/rejected": -43.83318328857422, "loss": 0.5462, "rewards/accuracies": 1.0, "rewards/chosen": 0.47076869010925293, "rewards/margins": 0.47076869010925293, "rewards/rejected": 0.0, "step": 77 }, { "epoch": 0.43575418994413406, "grad_norm": 20.06020648655684, "learning_rate": 2.717770034843205e-07, "logits/chosen": -2.5632386207580566, "logits/rejected": -2.5703723430633545, "logps/chosen": -29.538589477539062, "logps/rejected": -38.035972595214844, "loss": 0.5599, "rewards/accuracies": 1.0, "rewards/chosen": 0.33969616889953613, "rewards/margins": 0.33969616889953613, "rewards/rejected": 0.0, "step": 78 }, { "epoch": 0.441340782122905, "grad_norm": 17.522456596520726, "learning_rate": 2.7526132404181185e-07, "logits/chosen": -2.571640729904175, "logits/rejected": -2.5123398303985596, "logps/chosen": -23.64766502380371, "logps/rejected": -38.227935791015625, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 0.30658215284347534, "rewards/margins": 0.30658215284347534, "rewards/rejected": 0.0, "step": 79 }, { "epoch": 0.44692737430167595, "grad_norm": 17.609953811055686, "learning_rate": 2.7874564459930313e-07, "logits/chosen": -2.55928897857666, "logits/rejected": -2.609794855117798, "logps/chosen": -29.678600311279297, "logps/rejected": -34.138824462890625, "loss": 0.5527, "rewards/accuracies": 1.0, "rewards/chosen": 0.2765759229660034, "rewards/margins": 0.2765759229660034, "rewards/rejected": 0.0, "step": 80 }, { "epoch": 0.44692737430167595, "eval_logits/chosen": -2.5961174964904785, "eval_logits/rejected": -2.481539011001587, "eval_logps/chosen": -26.89170265197754, "eval_logps/rejected": -38.134918212890625, "eval_loss": 0.5455653071403503, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.3178338408470154, "eval_rewards/margins": 0.3178338408470154, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5365, "eval_samples_per_second": 9.244, "eval_steps_per_second": 0.298, "step": 80 }, { "epoch": 0.45251396648044695, "grad_norm": 17.243619602321587, "learning_rate": 2.822299651567944e-07, "logits/chosen": -2.766360282897949, "logits/rejected": -2.7481331825256348, "logps/chosen": -34.953697204589844, "logps/rejected": -44.08012008666992, "loss": 0.5556, "rewards/accuracies": 1.0, "rewards/chosen": 0.386406809091568, "rewards/margins": 0.386406809091568, "rewards/rejected": 0.0, "step": 81 }, { "epoch": 0.4581005586592179, "grad_norm": 18.12598041761261, "learning_rate": 2.857142857142857e-07, "logits/chosen": -2.5122439861297607, "logits/rejected": -2.2476606369018555, "logps/chosen": -38.5035285949707, "logps/rejected": -58.14997482299805, "loss": 0.5337, "rewards/accuracies": 1.0, "rewards/chosen": 0.6213464736938477, "rewards/margins": 0.6213464736938477, "rewards/rejected": 0.0, "step": 82 }, { "epoch": 0.46368715083798884, "grad_norm": 18.26018052160107, "learning_rate": 2.8919860627177703e-07, "logits/chosen": -3.008633613586426, "logits/rejected": -2.7243096828460693, "logps/chosen": -23.37417984008789, "logps/rejected": -25.58895492553711, "loss": 0.5336, "rewards/accuracies": 1.0, "rewards/chosen": 0.45575347542762756, "rewards/margins": 0.45575347542762756, "rewards/rejected": 0.0, "step": 83 }, { "epoch": 0.4692737430167598, "grad_norm": 17.95021613887715, "learning_rate": 2.9268292682926825e-07, "logits/chosen": -2.8492836952209473, "logits/rejected": -2.724342107772827, "logps/chosen": -22.531551361083984, "logps/rejected": -83.26819610595703, "loss": 0.5373, "rewards/accuracies": 1.0, "rewards/chosen": 0.4083956778049469, "rewards/margins": 0.4083956778049469, "rewards/rejected": 0.0, "step": 84 }, { "epoch": 0.4748603351955307, "grad_norm": 19.55540877246056, "learning_rate": 2.961672473867596e-07, "logits/chosen": -2.547548770904541, "logits/rejected": -2.4949235916137695, "logps/chosen": -39.774715423583984, "logps/rejected": -45.983741760253906, "loss": 0.519, "rewards/accuracies": 1.0, "rewards/chosen": 0.38765764236450195, "rewards/margins": 0.38765764236450195, "rewards/rejected": 0.0, "step": 85 }, { "epoch": 0.48044692737430167, "grad_norm": 17.44629190598895, "learning_rate": 2.996515679442508e-07, "logits/chosen": -2.9293198585510254, "logits/rejected": -2.554246187210083, "logps/chosen": -15.574745178222656, "logps/rejected": -54.118019104003906, "loss": 0.5206, "rewards/accuracies": 1.0, "rewards/chosen": 0.3024306893348694, "rewards/margins": 0.3024306893348694, "rewards/rejected": 0.0, "step": 86 }, { "epoch": 0.4860335195530726, "grad_norm": 17.640304319217552, "learning_rate": 3.0313588850174215e-07, "logits/chosen": -2.509176015853882, "logits/rejected": -2.6400609016418457, "logps/chosen": -26.335580825805664, "logps/rejected": -23.034029006958008, "loss": 0.5128, "rewards/accuracies": 1.0, "rewards/chosen": 0.2293638437986374, "rewards/margins": 0.2293638437986374, "rewards/rejected": 0.0, "step": 87 }, { "epoch": 0.49162011173184356, "grad_norm": 17.076730441032804, "learning_rate": 3.0662020905923343e-07, "logits/chosen": -2.748363733291626, "logits/rejected": -2.727149248123169, "logps/chosen": -39.85935974121094, "logps/rejected": -41.72587203979492, "loss": 0.5257, "rewards/accuracies": 1.0, "rewards/chosen": 0.3664461374282837, "rewards/margins": 0.3664461374282837, "rewards/rejected": 0.0, "step": 88 }, { "epoch": 0.4972067039106145, "grad_norm": 16.7596222624691, "learning_rate": 3.101045296167247e-07, "logits/chosen": -2.7122080326080322, "logits/rejected": -2.6718027591705322, "logps/chosen": -21.423099517822266, "logps/rejected": -39.17164611816406, "loss": 0.5254, "rewards/accuracies": 1.0, "rewards/chosen": 0.3437221646308899, "rewards/margins": 0.3437221646308899, "rewards/rejected": 0.0, "step": 89 }, { "epoch": 0.5027932960893855, "grad_norm": 16.370624963433137, "learning_rate": 3.13588850174216e-07, "logits/chosen": -2.7640321254730225, "logits/rejected": -2.6971404552459717, "logps/chosen": -27.607778549194336, "logps/rejected": -29.204383850097656, "loss": 0.5203, "rewards/accuracies": 1.0, "rewards/chosen": 0.34216684103012085, "rewards/margins": 0.34216684103012085, "rewards/rejected": 0.0, "step": 90 }, { "epoch": 0.5083798882681564, "grad_norm": 16.501196308487856, "learning_rate": 3.170731707317073e-07, "logits/chosen": -2.977828025817871, "logits/rejected": -2.7632179260253906, "logps/chosen": -25.994243621826172, "logps/rejected": -50.56151580810547, "loss": 0.5309, "rewards/accuracies": 1.0, "rewards/chosen": 0.41055914759635925, "rewards/margins": 0.41055914759635925, "rewards/rejected": 0.0, "step": 91 }, { "epoch": 0.5139664804469274, "grad_norm": 16.668279912747, "learning_rate": 3.2055749128919855e-07, "logits/chosen": -2.616750955581665, "logits/rejected": -2.6325173377990723, "logps/chosen": -27.63555908203125, "logps/rejected": -26.495452880859375, "loss": 0.5155, "rewards/accuracies": 1.0, "rewards/chosen": 0.44802695512771606, "rewards/margins": 0.44802695512771606, "rewards/rejected": 0.0, "step": 92 }, { "epoch": 0.5195530726256983, "grad_norm": 16.585791801396, "learning_rate": 3.240418118466899e-07, "logits/chosen": -2.691551446914673, "logits/rejected": -2.4979159832000732, "logps/chosen": -26.484764099121094, "logps/rejected": -34.90871810913086, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 0.4211455285549164, "rewards/margins": 0.4211455285549164, "rewards/rejected": 0.0, "step": 93 }, { "epoch": 0.5251396648044693, "grad_norm": 15.508507664030144, "learning_rate": 3.2752613240418117e-07, "logits/chosen": -2.3586292266845703, "logits/rejected": -2.530395746231079, "logps/chosen": -26.607080459594727, "logps/rejected": -29.511329650878906, "loss": 0.5221, "rewards/accuracies": 1.0, "rewards/chosen": 0.381470650434494, "rewards/margins": 0.381470650434494, "rewards/rejected": 0.0, "step": 94 }, { "epoch": 0.5307262569832403, "grad_norm": 14.73280514705423, "learning_rate": 3.3101045296167245e-07, "logits/chosen": -2.9987075328826904, "logits/rejected": -2.319053888320923, "logps/chosen": -15.4478178024292, "logps/rejected": -50.605716705322266, "loss": 0.5068, "rewards/accuracies": 1.0, "rewards/chosen": 0.46311700344085693, "rewards/margins": 0.46311700344085693, "rewards/rejected": 0.0, "step": 95 }, { "epoch": 0.5363128491620112, "grad_norm": 15.464939165348971, "learning_rate": 3.344947735191638e-07, "logits/chosen": -3.0241072177886963, "logits/rejected": -2.82523775100708, "logps/chosen": -15.733403205871582, "logps/rejected": -23.26158332824707, "loss": 0.4811, "rewards/accuracies": 1.0, "rewards/chosen": 0.3380926251411438, "rewards/margins": 0.3380926251411438, "rewards/rejected": 0.0, "step": 96 }, { "epoch": 0.5418994413407822, "grad_norm": 15.07425710254498, "learning_rate": 3.3797909407665506e-07, "logits/chosen": -2.829176425933838, "logits/rejected": -2.7734498977661133, "logps/chosen": -26.00864028930664, "logps/rejected": -61.76011276245117, "loss": 0.4938, "rewards/accuracies": 1.0, "rewards/chosen": 0.47422951459884644, "rewards/margins": 0.47422951459884644, "rewards/rejected": 0.0, "step": 97 }, { "epoch": 0.547486033519553, "grad_norm": 15.292202698963973, "learning_rate": 3.4146341463414634e-07, "logits/chosen": -3.094583749771118, "logits/rejected": -2.988278388977051, "logps/chosen": -15.285747528076172, "logps/rejected": -23.24346923828125, "loss": 0.4904, "rewards/accuracies": 1.0, "rewards/chosen": 0.3607971668243408, "rewards/margins": 0.3607971668243408, "rewards/rejected": 0.0, "step": 98 }, { "epoch": 0.553072625698324, "grad_norm": 14.280871491578443, "learning_rate": 3.449477351916376e-07, "logits/chosen": -3.0072247982025146, "logits/rejected": -2.7647087574005127, "logps/chosen": -17.79582405090332, "logps/rejected": -30.290449142456055, "loss": 0.4646, "rewards/accuracies": 1.0, "rewards/chosen": 0.43808504939079285, "rewards/margins": 0.43808504939079285, "rewards/rejected": 0.0, "step": 99 }, { "epoch": 0.5586592178770949, "grad_norm": 16.045439012707615, "learning_rate": 3.484320557491289e-07, "logits/chosen": -2.7985477447509766, "logits/rejected": -2.6149985790252686, "logps/chosen": -41.18121337890625, "logps/rejected": -41.32271194458008, "loss": 0.4155, "rewards/accuracies": 1.0, "rewards/chosen": 0.6721265316009521, "rewards/margins": 0.6721265316009521, "rewards/rejected": 0.0, "step": 100 }, { "epoch": 0.5586592178770949, "eval_logits/chosen": -2.7876787185668945, "eval_logits/rejected": -2.70434308052063, "eval_logps/chosen": -24.622821807861328, "eval_logps/rejected": -35.58250045776367, "eval_loss": 0.4617948532104492, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.5447221994400024, "eval_rewards/margins": 0.5447221994400024, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5634, "eval_samples_per_second": 9.236, "eval_steps_per_second": 0.298, "step": 100 }, { "epoch": 0.5642458100558659, "grad_norm": 12.573137728140779, "learning_rate": 3.519163763066202e-07, "logits/chosen": -2.688979387283325, "logits/rejected": -2.658362865447998, "logps/chosen": -16.801599502563477, "logps/rejected": -17.431743621826172, "loss": 0.4702, "rewards/accuracies": 1.0, "rewards/chosen": 0.346937894821167, "rewards/margins": 0.346937894821167, "rewards/rejected": 0.0, "step": 101 }, { "epoch": 0.5698324022346368, "grad_norm": 13.331911966929777, "learning_rate": 3.554006968641115e-07, "logits/chosen": -2.7960867881774902, "logits/rejected": -2.776906728744507, "logps/chosen": -37.72350311279297, "logps/rejected": -41.60508728027344, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 0.7360150814056396, "rewards/margins": 0.7360150814056396, "rewards/rejected": 0.0, "step": 102 }, { "epoch": 0.5754189944134078, "grad_norm": 13.730862772517419, "learning_rate": 3.5888501742160274e-07, "logits/chosen": -3.160386562347412, "logits/rejected": -3.0882933139801025, "logps/chosen": -14.691383361816406, "logps/rejected": -32.46586608886719, "loss": 0.4682, "rewards/accuracies": 1.0, "rewards/chosen": 0.44620683789253235, "rewards/margins": 0.44620683789253235, "rewards/rejected": 0.0, "step": 103 }, { "epoch": 0.5810055865921788, "grad_norm": 12.958763580555999, "learning_rate": 3.623693379790941e-07, "logits/chosen": -2.806156873703003, "logits/rejected": -2.7647111415863037, "logps/chosen": -26.239046096801758, "logps/rejected": -27.40913963317871, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 0.6825430393218994, "rewards/margins": 0.6825430393218994, "rewards/rejected": 0.0, "step": 104 }, { "epoch": 0.5865921787709497, "grad_norm": 13.360824364314807, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -2.783945083618164, "logits/rejected": -2.7649834156036377, "logps/chosen": -23.880428314208984, "logps/rejected": -24.65614128112793, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 0.5743331909179688, "rewards/margins": 0.5743331909179688, "rewards/rejected": 0.0, "step": 105 }, { "epoch": 0.5921787709497207, "grad_norm": 13.104227770358209, "learning_rate": 3.6933797909407664e-07, "logits/chosen": -3.279587507247925, "logits/rejected": -3.1188621520996094, "logps/chosen": -18.71685791015625, "logps/rejected": -33.31108474731445, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 0.6923595070838928, "rewards/margins": 0.6923595070838928, "rewards/rejected": 0.0, "step": 106 }, { "epoch": 0.5977653631284916, "grad_norm": 14.148746367362307, "learning_rate": 3.728222996515679e-07, "logits/chosen": -3.1686055660247803, "logits/rejected": -3.0254077911376953, "logps/chosen": -20.358896255493164, "logps/rejected": -36.1294059753418, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 0.6534121036529541, "rewards/margins": 0.6534121036529541, "rewards/rejected": 0.0, "step": 107 }, { "epoch": 0.6033519553072626, "grad_norm": 11.985725221178399, "learning_rate": 3.7630662020905925e-07, "logits/chosen": -2.710590124130249, "logits/rejected": -2.8439455032348633, "logps/chosen": -40.90412139892578, "logps/rejected": -32.95746994018555, "loss": 0.4134, "rewards/accuracies": 1.0, "rewards/chosen": 1.029143214225769, "rewards/margins": 1.029143214225769, "rewards/rejected": 0.0, "step": 108 }, { "epoch": 0.6089385474860335, "grad_norm": 13.248918349041723, "learning_rate": 3.797909407665505e-07, "logits/chosen": -2.8310110569000244, "logits/rejected": -2.5916903018951416, "logps/chosen": -28.208520889282227, "logps/rejected": -52.245033264160156, "loss": 0.4174, "rewards/accuracies": 1.0, "rewards/chosen": 0.562522292137146, "rewards/margins": 0.562522292137146, "rewards/rejected": 0.0, "step": 109 }, { "epoch": 0.6145251396648045, "grad_norm": 12.5998803430947, "learning_rate": 3.832752613240418e-07, "logits/chosen": -3.032209634780884, "logits/rejected": -3.0512664318084717, "logps/chosen": -19.49001693725586, "logps/rejected": -72.57357788085938, "loss": 0.4215, "rewards/accuracies": 1.0, "rewards/chosen": 0.7006881237030029, "rewards/margins": 0.7006881237030029, "rewards/rejected": 0.0, "step": 110 }, { "epoch": 0.6201117318435754, "grad_norm": 13.65187382845731, "learning_rate": 3.867595818815331e-07, "logits/chosen": -2.8330180644989014, "logits/rejected": -2.093581199645996, "logps/chosen": -25.720317840576172, "logps/rejected": -62.531890869140625, "loss": 0.431, "rewards/accuracies": 1.0, "rewards/chosen": 0.6129947900772095, "rewards/margins": 0.6129947900772095, "rewards/rejected": 0.0, "step": 111 }, { "epoch": 0.6256983240223464, "grad_norm": 13.466950507028253, "learning_rate": 3.902439024390244e-07, "logits/chosen": -3.234790325164795, "logits/rejected": -3.256101369857788, "logps/chosen": -21.19486427307129, "logps/rejected": -40.322139739990234, "loss": 0.451, "rewards/accuracies": 0.75, "rewards/chosen": 0.36941981315612793, "rewards/margins": 0.36941981315612793, "rewards/rejected": 0.0, "step": 112 }, { "epoch": 0.6312849162011173, "grad_norm": 13.653645461327002, "learning_rate": 3.9372822299651566e-07, "logits/chosen": -3.116182804107666, "logits/rejected": -2.931743860244751, "logps/chosen": -37.29005813598633, "logps/rejected": -53.360008239746094, "loss": 0.45, "rewards/accuracies": 1.0, "rewards/chosen": 0.6139600872993469, "rewards/margins": 0.6139600872993469, "rewards/rejected": 0.0, "step": 113 }, { "epoch": 0.6368715083798883, "grad_norm": 12.366194224921594, "learning_rate": 3.9721254355400694e-07, "logits/chosen": -3.1295180320739746, "logits/rejected": -3.1512725353240967, "logps/chosen": -22.38964080810547, "logps/rejected": -30.870153427124023, "loss": 0.4218, "rewards/accuracies": 1.0, "rewards/chosen": 0.8451684713363647, "rewards/margins": 0.8451684713363647, "rewards/rejected": 0.0, "step": 114 }, { "epoch": 0.6424581005586593, "grad_norm": 12.820270690808119, "learning_rate": 4.006968641114982e-07, "logits/chosen": -2.5849902629852295, "logits/rejected": -2.5268383026123047, "logps/chosen": -39.322906494140625, "logps/rejected": -31.824663162231445, "loss": 0.435, "rewards/accuracies": 1.0, "rewards/chosen": 0.6132646799087524, "rewards/margins": 0.6132646799087524, "rewards/rejected": 0.0, "step": 115 }, { "epoch": 0.6480446927374302, "grad_norm": 12.544434650205087, "learning_rate": 4.0418118466898955e-07, "logits/chosen": -3.1694047451019287, "logits/rejected": -3.014833688735962, "logps/chosen": -32.03730010986328, "logps/rejected": -36.730979919433594, "loss": 0.407, "rewards/accuracies": 1.0, "rewards/chosen": 0.9823371171951294, "rewards/margins": 0.9823371171951294, "rewards/rejected": 0.0, "step": 116 }, { "epoch": 0.6536312849162011, "grad_norm": 12.963061338666149, "learning_rate": 4.076655052264808e-07, "logits/chosen": -3.153256893157959, "logits/rejected": -3.0860118865966797, "logps/chosen": -28.097692489624023, "logps/rejected": -47.145748138427734, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587994813919067, "rewards/margins": 0.9587994813919067, "rewards/rejected": 0.0, "step": 117 }, { "epoch": 0.659217877094972, "grad_norm": 13.019617331540443, "learning_rate": 4.111498257839721e-07, "logits/chosen": -3.1117141246795654, "logits/rejected": -2.981959581375122, "logps/chosen": -29.46117401123047, "logps/rejected": -54.425132751464844, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9660426378250122, "rewards/margins": 0.9660426378250122, "rewards/rejected": 0.0, "step": 118 }, { "epoch": 0.664804469273743, "grad_norm": 12.09790313144843, "learning_rate": 4.146341463414634e-07, "logits/chosen": -3.2568607330322266, "logits/rejected": -3.243248701095581, "logps/chosen": -35.24617004394531, "logps/rejected": -28.864641189575195, "loss": 0.4081, "rewards/accuracies": 1.0, "rewards/chosen": 0.5788088440895081, "rewards/margins": 0.5788088440895081, "rewards/rejected": 0.0, "step": 119 }, { "epoch": 0.6703910614525139, "grad_norm": 12.299789893471367, "learning_rate": 4.1811846689895467e-07, "logits/chosen": -3.4779090881347656, "logits/rejected": -3.5990796089172363, "logps/chosen": -27.420997619628906, "logps/rejected": -29.504467010498047, "loss": 0.4148, "rewards/accuracies": 1.0, "rewards/chosen": 0.7487576603889465, "rewards/margins": 0.7487576603889465, "rewards/rejected": 0.0, "step": 120 }, { "epoch": 0.6703910614525139, "eval_logits/chosen": -2.985457420349121, "eval_logits/rejected": -2.9356820583343506, "eval_logps/chosen": -22.773021697998047, "eval_logps/rejected": -33.96015548706055, "eval_loss": 0.41244789958000183, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 0.7297021746635437, "eval_rewards/margins": 0.7297021746635437, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5893, "eval_samples_per_second": 9.229, "eval_steps_per_second": 0.298, "step": 120 }, { "epoch": 0.6759776536312849, "grad_norm": 11.918901658967625, "learning_rate": 4.2160278745644595e-07, "logits/chosen": -3.361621618270874, "logits/rejected": -3.4122440814971924, "logps/chosen": -24.095653533935547, "logps/rejected": -39.066993713378906, "loss": 0.4248, "rewards/accuracies": 1.0, "rewards/chosen": 0.7933073043823242, "rewards/margins": 0.7933073043823242, "rewards/rejected": 0.0, "step": 121 }, { "epoch": 0.6815642458100558, "grad_norm": 12.09693560581, "learning_rate": 4.250871080139373e-07, "logits/chosen": -3.1030969619750977, "logits/rejected": -3.078477382659912, "logps/chosen": -31.073591232299805, "logps/rejected": -18.581279754638672, "loss": 0.4445, "rewards/accuracies": 1.0, "rewards/chosen": 0.20919391512870789, "rewards/margins": 0.20919391512870789, "rewards/rejected": 0.0, "step": 122 }, { "epoch": 0.6871508379888268, "grad_norm": 13.230174521061821, "learning_rate": 4.285714285714285e-07, "logits/chosen": -2.724034547805786, "logits/rejected": -2.712747573852539, "logps/chosen": -38.76531982421875, "logps/rejected": -40.227027893066406, "loss": 0.4477, "rewards/accuracies": 1.0, "rewards/chosen": 1.081432819366455, "rewards/margins": 1.081432819366455, "rewards/rejected": 0.0, "step": 123 }, { "epoch": 0.6927374301675978, "grad_norm": 11.029542893807358, "learning_rate": 4.3205574912891985e-07, "logits/chosen": -3.2009103298187256, "logits/rejected": -3.1295859813690186, "logps/chosen": -17.012771606445312, "logps/rejected": -38.768104553222656, "loss": 0.3632, "rewards/accuracies": 1.0, "rewards/chosen": 0.8287367820739746, "rewards/margins": 0.8287367820739746, "rewards/rejected": 0.0, "step": 124 }, { "epoch": 0.6983240223463687, "grad_norm": 12.92541684214979, "learning_rate": 4.3554006968641113e-07, "logits/chosen": -3.3594982624053955, "logits/rejected": -3.184117317199707, "logps/chosen": -15.785029411315918, "logps/rejected": -44.183937072753906, "loss": 0.4085, "rewards/accuracies": 1.0, "rewards/chosen": 0.5601776242256165, "rewards/margins": 0.5601776242256165, "rewards/rejected": 0.0, "step": 125 }, { "epoch": 0.7039106145251397, "grad_norm": 12.418840197192134, "learning_rate": 4.390243902439024e-07, "logits/chosen": -3.018588066101074, "logits/rejected": -2.9847147464752197, "logps/chosen": -16.75999641418457, "logps/rejected": -46.10063934326172, "loss": 0.3922, "rewards/accuracies": 1.0, "rewards/chosen": 0.6556053161621094, "rewards/margins": 0.6556053161621094, "rewards/rejected": 0.0, "step": 126 }, { "epoch": 0.7094972067039106, "grad_norm": 10.889083863099927, "learning_rate": 4.425087108013937e-07, "logits/chosen": -3.1690027713775635, "logits/rejected": -3.025820016860962, "logps/chosen": -35.54032516479492, "logps/rejected": -34.36103820800781, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 1.0566309690475464, "rewards/margins": 1.0566309690475464, "rewards/rejected": 0.0, "step": 127 }, { "epoch": 0.7150837988826816, "grad_norm": 12.906782926741812, "learning_rate": 4.45993031358885e-07, "logits/chosen": -3.336862087249756, "logits/rejected": -3.1122004985809326, "logps/chosen": -16.735675811767578, "logps/rejected": -49.08568572998047, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 0.7246649861335754, "rewards/margins": 0.7246649861335754, "rewards/rejected": 0.0, "step": 128 }, { "epoch": 0.7206703910614525, "grad_norm": 11.359331301126964, "learning_rate": 4.494773519163763e-07, "logits/chosen": -3.3350188732147217, "logits/rejected": -3.3323018550872803, "logps/chosen": -20.424598693847656, "logps/rejected": -26.444061279296875, "loss": 0.4034, "rewards/accuracies": 1.0, "rewards/chosen": 0.9043171405792236, "rewards/margins": 0.9043171405792236, "rewards/rejected": 0.0, "step": 129 }, { "epoch": 0.7262569832402235, "grad_norm": 11.530762525487518, "learning_rate": 4.529616724738676e-07, "logits/chosen": -3.4406161308288574, "logits/rejected": -3.4307963848114014, "logps/chosen": -30.73476791381836, "logps/rejected": -31.74262809753418, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 0.6756500005722046, "rewards/margins": 0.6756500005722046, "rewards/rejected": 0.0, "step": 130 }, { "epoch": 0.7318435754189944, "grad_norm": 15.939552333565983, "learning_rate": 4.5644599303135886e-07, "logits/chosen": -3.0016088485717773, "logits/rejected": -3.0341885089874268, "logps/chosen": -19.028980255126953, "logps/rejected": -35.876129150390625, "loss": 0.4286, "rewards/accuracies": 1.0, "rewards/chosen": 0.7295345067977905, "rewards/margins": 0.7295345067977905, "rewards/rejected": 0.0, "step": 131 }, { "epoch": 0.7374301675977654, "grad_norm": 12.180173279607684, "learning_rate": 4.5993031358885015e-07, "logits/chosen": -3.136824369430542, "logits/rejected": -2.9175052642822266, "logps/chosen": -22.140581130981445, "logps/rejected": -34.302146911621094, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 0.5968088507652283, "rewards/margins": 0.5968088507652283, "rewards/rejected": 0.0, "step": 132 }, { "epoch": 0.7430167597765364, "grad_norm": 13.82002718598598, "learning_rate": 4.634146341463415e-07, "logits/chosen": -2.8966777324676514, "logits/rejected": -2.788705587387085, "logps/chosen": -30.218425750732422, "logps/rejected": -40.37165832519531, "loss": 0.4183, "rewards/accuracies": 1.0, "rewards/chosen": 1.0006399154663086, "rewards/margins": 1.0006399154663086, "rewards/rejected": 0.0, "step": 133 }, { "epoch": 0.7486033519553073, "grad_norm": 11.279996388030757, "learning_rate": 4.668989547038327e-07, "logits/chosen": -3.263176441192627, "logits/rejected": -3.253405809402466, "logps/chosen": -22.389724731445312, "logps/rejected": -28.712936401367188, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 0.8534319996833801, "rewards/margins": 0.8534319996833801, "rewards/rejected": 0.0, "step": 134 }, { "epoch": 0.7541899441340782, "grad_norm": 13.31664654210755, "learning_rate": 4.7038327526132404e-07, "logits/chosen": -3.196460008621216, "logits/rejected": -3.055830478668213, "logps/chosen": -13.232107162475586, "logps/rejected": -26.673044204711914, "loss": 0.3906, "rewards/accuracies": 1.0, "rewards/chosen": 0.5391906499862671, "rewards/margins": 0.5391906499862671, "rewards/rejected": 0.0, "step": 135 }, { "epoch": 0.7597765363128491, "grad_norm": 13.644525356031862, "learning_rate": 4.738675958188153e-07, "logits/chosen": -3.086418867111206, "logits/rejected": -3.050128221511841, "logps/chosen": -63.31258773803711, "logps/rejected": -45.431800842285156, "loss": 0.4112, "rewards/accuracies": 1.0, "rewards/chosen": 0.6270837783813477, "rewards/margins": 0.6270837783813477, "rewards/rejected": 0.0, "step": 136 }, { "epoch": 0.7653631284916201, "grad_norm": 11.75120264957957, "learning_rate": 4.773519163763066e-07, "logits/chosen": -3.3567633628845215, "logits/rejected": -3.200810432434082, "logps/chosen": -23.672664642333984, "logps/rejected": -31.51905059814453, "loss": 0.4161, "rewards/accuracies": 1.0, "rewards/chosen": 0.5618863105773926, "rewards/margins": 0.5618863105773926, "rewards/rejected": 0.0, "step": 137 }, { "epoch": 0.770949720670391, "grad_norm": 12.836460989737148, "learning_rate": 4.808362369337979e-07, "logits/chosen": -3.219993829727173, "logits/rejected": -3.0484225749969482, "logps/chosen": -51.30001449584961, "logps/rejected": -43.791709899902344, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 1.059951901435852, "rewards/margins": 1.059951901435852, "rewards/rejected": 0.0, "step": 138 }, { "epoch": 0.776536312849162, "grad_norm": 13.634756786440422, "learning_rate": 4.843205574912892e-07, "logits/chosen": -2.9743027687072754, "logits/rejected": -2.8742432594299316, "logps/chosen": -31.341638565063477, "logps/rejected": -30.591285705566406, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": 1.0887293815612793, "rewards/margins": 1.0887293815612793, "rewards/rejected": 0.0, "step": 139 }, { "epoch": 0.7821229050279329, "grad_norm": 13.257090900452434, "learning_rate": 4.878048780487804e-07, "logits/chosen": -3.090188503265381, "logits/rejected": -2.824709415435791, "logps/chosen": -37.55943298339844, "logps/rejected": -23.54281234741211, "loss": 0.4048, "rewards/accuracies": 0.75, "rewards/chosen": 0.8553612232208252, "rewards/margins": 0.8553612232208252, "rewards/rejected": 0.0, "step": 140 }, { "epoch": 0.7821229050279329, "eval_logits/chosen": -3.085087299346924, "eval_logits/rejected": -3.0553345680236816, "eval_logps/chosen": -21.801071166992188, "eval_logps/rejected": -33.04508972167969, "eval_loss": 0.3879704773426056, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 0.8268970251083374, "eval_rewards/margins": 0.8268970251083374, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6022, "eval_samples_per_second": 9.226, "eval_steps_per_second": 0.298, "step": 140 }, { "epoch": 0.7877094972067039, "grad_norm": 13.170388084647799, "learning_rate": 4.912891986062717e-07, "logits/chosen": -3.280094861984253, "logits/rejected": -3.313344955444336, "logps/chosen": -16.471755981445312, "logps/rejected": -47.39679718017578, "loss": 0.4417, "rewards/accuracies": 1.0, "rewards/chosen": 0.6201029419898987, "rewards/margins": 0.6201029419898987, "rewards/rejected": 0.0, "step": 141 }, { "epoch": 0.7932960893854749, "grad_norm": 12.471190495621682, "learning_rate": 4.94773519163763e-07, "logits/chosen": -3.4723424911499023, "logits/rejected": -3.2591540813446045, "logps/chosen": -12.425056457519531, "logps/rejected": -45.91838836669922, "loss": 0.3801, "rewards/accuracies": 1.0, "rewards/chosen": 0.765860378742218, "rewards/margins": 0.765860378742218, "rewards/rejected": 0.0, "step": 142 }, { "epoch": 0.7988826815642458, "grad_norm": 11.753324312801528, "learning_rate": 4.982578397212544e-07, "logits/chosen": -3.0344104766845703, "logits/rejected": -3.033134698867798, "logps/chosen": -27.046661376953125, "logps/rejected": -30.028705596923828, "loss": 0.352, "rewards/accuracies": 1.0, "rewards/chosen": 1.2494921684265137, "rewards/margins": 1.2494921684265137, "rewards/rejected": 0.0, "step": 143 }, { "epoch": 0.8044692737430168, "grad_norm": 12.753673627655559, "learning_rate": 5.017421602787456e-07, "logits/chosen": -3.360502004623413, "logits/rejected": -3.2027170658111572, "logps/chosen": -18.972614288330078, "logps/rejected": -27.654964447021484, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": 0.8680293560028076, "rewards/margins": 0.8680293560028076, "rewards/rejected": 0.0, "step": 144 }, { "epoch": 0.8100558659217877, "grad_norm": 14.327923210822185, "learning_rate": 5.052264808362368e-07, "logits/chosen": -3.0389859676361084, "logits/rejected": -3.176422595977783, "logps/chosen": -27.511253356933594, "logps/rejected": -49.270713806152344, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.1916924715042114, "rewards/margins": 1.1916924715042114, "rewards/rejected": 0.0, "step": 145 }, { "epoch": 0.8156424581005587, "grad_norm": 11.956878046826485, "learning_rate": 5.087108013937282e-07, "logits/chosen": -3.2901556491851807, "logits/rejected": -3.1521785259246826, "logps/chosen": -21.122501373291016, "logps/rejected": -32.07401657104492, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 0.6621261835098267, "rewards/margins": 0.6621261835098267, "rewards/rejected": 0.0, "step": 146 }, { "epoch": 0.8212290502793296, "grad_norm": 13.441070658644444, "learning_rate": 5.121951219512195e-07, "logits/chosen": -3.182860851287842, "logits/rejected": -3.0047945976257324, "logps/chosen": -25.162214279174805, "logps/rejected": -86.56019592285156, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 0.7732621431350708, "rewards/margins": 0.7732621431350708, "rewards/rejected": 0.0, "step": 147 }, { "epoch": 0.8268156424581006, "grad_norm": 12.43524346477179, "learning_rate": 5.156794425087108e-07, "logits/chosen": -3.2569799423217773, "logits/rejected": -2.9275951385498047, "logps/chosen": -18.716459274291992, "logps/rejected": -23.8824462890625, "loss": 0.3594, "rewards/accuracies": 1.0, "rewards/chosen": 0.7202714681625366, "rewards/margins": 0.7202714681625366, "rewards/rejected": 0.0, "step": 148 }, { "epoch": 0.8324022346368715, "grad_norm": 10.290562092635898, "learning_rate": 5.191637630662021e-07, "logits/chosen": -3.4133076667785645, "logits/rejected": -3.531622886657715, "logps/chosen": -18.537792205810547, "logps/rejected": -52.4014892578125, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 0.7665916681289673, "rewards/margins": 0.7665916681289673, "rewards/rejected": 0.0, "step": 149 }, { "epoch": 0.8379888268156425, "grad_norm": 12.698613392610968, "learning_rate": 5.226480836236934e-07, "logits/chosen": -3.232588529586792, "logits/rejected": -3.436432361602783, "logps/chosen": -30.869352340698242, "logps/rejected": -27.366065979003906, "loss": 0.3824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0566002130508423, "rewards/margins": 1.0566002130508423, "rewards/rejected": 0.0, "step": 150 }, { "epoch": 0.8435754189944135, "grad_norm": 10.300001343681256, "learning_rate": 5.261324041811846e-07, "logits/chosen": -3.4442994594573975, "logits/rejected": -3.2811100482940674, "logps/chosen": -20.960033416748047, "logps/rejected": -44.81601333618164, "loss": 0.3153, "rewards/accuracies": 1.0, "rewards/chosen": 1.3567603826522827, "rewards/margins": 1.3567603826522827, "rewards/rejected": 0.0, "step": 151 }, { "epoch": 0.8491620111731844, "grad_norm": 10.426058542643384, "learning_rate": 5.296167247386759e-07, "logits/chosen": -3.2275266647338867, "logits/rejected": -2.9595117568969727, "logps/chosen": -23.1798095703125, "logps/rejected": -30.18008804321289, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 1.275942325592041, "rewards/margins": 1.275942325592041, "rewards/rejected": 0.0, "step": 152 }, { "epoch": 0.8547486033519553, "grad_norm": 14.089341146153586, "learning_rate": 5.331010452961672e-07, "logits/chosen": -3.4573311805725098, "logits/rejected": -3.4678773880004883, "logps/chosen": -28.501304626464844, "logps/rejected": -15.408483505249023, "loss": 0.4136, "rewards/accuracies": 0.75, "rewards/chosen": 0.659926176071167, "rewards/margins": 0.659926176071167, "rewards/rejected": 0.0, "step": 153 }, { "epoch": 0.8603351955307262, "grad_norm": 13.935787499891406, "learning_rate": 5.365853658536586e-07, "logits/chosen": -3.2246856689453125, "logits/rejected": -3.205880880355835, "logps/chosen": -10.82982063293457, "logps/rejected": -18.569210052490234, "loss": 0.3684, "rewards/accuracies": 1.0, "rewards/chosen": 0.7554144263267517, "rewards/margins": 0.7554144263267517, "rewards/rejected": 0.0, "step": 154 }, { "epoch": 0.8659217877094972, "grad_norm": 11.205255809753025, "learning_rate": 5.400696864111498e-07, "logits/chosen": -3.4092469215393066, "logits/rejected": -3.27697491645813, "logps/chosen": -19.24517250061035, "logps/rejected": -27.456302642822266, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": 0.9654081463813782, "rewards/margins": 0.9654081463813782, "rewards/rejected": 0.0, "step": 155 }, { "epoch": 0.8715083798882681, "grad_norm": 13.326177301871766, "learning_rate": 5.43554006968641e-07, "logits/chosen": -3.5050294399261475, "logits/rejected": -3.224290609359741, "logps/chosen": -14.973371505737305, "logps/rejected": -39.803672790527344, "loss": 0.414, "rewards/accuracies": 1.0, "rewards/chosen": 0.8258927464485168, "rewards/margins": 0.8258927464485168, "rewards/rejected": 0.0, "step": 156 }, { "epoch": 0.8770949720670391, "grad_norm": 13.809466960643286, "learning_rate": 5.470383275261323e-07, "logits/chosen": -3.009963035583496, "logits/rejected": -2.6967086791992188, "logps/chosen": -26.160964965820312, "logps/rejected": -51.31365966796875, "loss": 0.3614, "rewards/accuracies": 1.0, "rewards/chosen": 1.279192566871643, "rewards/margins": 1.279192566871643, "rewards/rejected": 0.0, "step": 157 }, { "epoch": 0.88268156424581, "grad_norm": 14.909423246602014, "learning_rate": 5.505226480836237e-07, "logits/chosen": -3.4372055530548096, "logits/rejected": -3.270108938217163, "logps/chosen": -13.904682159423828, "logps/rejected": -23.854835510253906, "loss": 0.3709, "rewards/accuracies": 0.75, "rewards/chosen": 0.46371743083000183, "rewards/margins": 0.46371743083000183, "rewards/rejected": 0.0, "step": 158 }, { "epoch": 0.888268156424581, "grad_norm": 11.144765174986764, "learning_rate": 5.54006968641115e-07, "logits/chosen": -3.4928090572357178, "logits/rejected": -3.3432681560516357, "logps/chosen": -12.412818908691406, "logps/rejected": -23.834762573242188, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 0.7286128997802734, "rewards/margins": 0.7286128997802734, "rewards/rejected": 0.0, "step": 159 }, { "epoch": 0.8938547486033519, "grad_norm": 13.863594250814781, "learning_rate": 5.574912891986063e-07, "logits/chosen": -3.4417412281036377, "logits/rejected": -3.0007429122924805, "logps/chosen": -31.18167495727539, "logps/rejected": -49.418609619140625, "loss": 0.367, "rewards/accuracies": 1.0, "rewards/chosen": 1.020141363143921, "rewards/margins": 1.020141363143921, "rewards/rejected": 0.0, "step": 160 }, { "epoch": 0.8938547486033519, "eval_logits/chosen": -3.1731648445129395, "eval_logits/rejected": -3.1657137870788574, "eval_logps/chosen": -20.766986846923828, "eval_logps/rejected": -32.35370635986328, "eval_loss": 0.3678908944129944, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.9303056001663208, "eval_rewards/margins": 0.9303056001663208, "eval_rewards/rejected": 0.0, "eval_runtime": 33.697, "eval_samples_per_second": 9.2, "eval_steps_per_second": 0.297, "step": 160 }, { "epoch": 0.8994413407821229, "grad_norm": 10.547321075543701, "learning_rate": 5.609756097560975e-07, "logits/chosen": -3.3858096599578857, "logits/rejected": -3.1765737533569336, "logps/chosen": -18.24430274963379, "logps/rejected": -43.009735107421875, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 1.0767275094985962, "rewards/margins": 1.0767275094985962, "rewards/rejected": 0.0, "step": 161 }, { "epoch": 0.9050279329608939, "grad_norm": 13.31104092875125, "learning_rate": 5.644599303135888e-07, "logits/chosen": -3.298539876937866, "logits/rejected": -2.8914856910705566, "logps/chosen": -17.463916778564453, "logps/rejected": -20.251060485839844, "loss": 0.3565, "rewards/accuracies": 1.0, "rewards/chosen": 0.924486517906189, "rewards/margins": 0.924486517906189, "rewards/rejected": 0.0, "step": 162 }, { "epoch": 0.9106145251396648, "grad_norm": 12.189074268400924, "learning_rate": 5.679442508710801e-07, "logits/chosen": -3.2942914962768555, "logits/rejected": -3.2542693614959717, "logps/chosen": -21.734283447265625, "logps/rejected": -25.956695556640625, "loss": 0.364, "rewards/accuracies": 1.0, "rewards/chosen": 0.8711822032928467, "rewards/margins": 0.8711822032928467, "rewards/rejected": 0.0, "step": 163 }, { "epoch": 0.9162011173184358, "grad_norm": 10.739512007740988, "learning_rate": 5.714285714285714e-07, "logits/chosen": -3.2355618476867676, "logits/rejected": -3.083498239517212, "logps/chosen": -21.765317916870117, "logps/rejected": -21.472946166992188, "loss": 0.3473, "rewards/accuracies": 1.0, "rewards/chosen": 1.1917572021484375, "rewards/margins": 1.1917572021484375, "rewards/rejected": 0.0, "step": 164 }, { "epoch": 0.9217877094972067, "grad_norm": 11.746475390473849, "learning_rate": 5.749128919860628e-07, "logits/chosen": -3.3734073638916016, "logits/rejected": -3.3708062171936035, "logps/chosen": -37.94599151611328, "logps/rejected": -33.02565383911133, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 1.279693603515625, "rewards/margins": 1.279693603515625, "rewards/rejected": 0.0, "step": 165 }, { "epoch": 0.9273743016759777, "grad_norm": 11.414057455521112, "learning_rate": 5.783972125435541e-07, "logits/chosen": -3.4064152240753174, "logits/rejected": -3.3076775074005127, "logps/chosen": -8.735708236694336, "logps/rejected": -26.29998016357422, "loss": 0.377, "rewards/accuracies": 1.0, "rewards/chosen": 0.45992982387542725, "rewards/margins": 0.45992982387542725, "rewards/rejected": 0.0, "step": 166 }, { "epoch": 0.9329608938547486, "grad_norm": 10.752674294933225, "learning_rate": 5.818815331010452e-07, "logits/chosen": -3.4760870933532715, "logits/rejected": -3.190138101577759, "logps/chosen": -19.67440414428711, "logps/rejected": -27.942874908447266, "loss": 0.3322, "rewards/accuracies": 1.0, "rewards/chosen": 1.020792007446289, "rewards/margins": 1.020792007446289, "rewards/rejected": 0.0, "step": 167 }, { "epoch": 0.9385474860335196, "grad_norm": 20.059327017371768, "learning_rate": 5.853658536585365e-07, "logits/chosen": -3.1289055347442627, "logits/rejected": -3.1791739463806152, "logps/chosen": -14.388964653015137, "logps/rejected": -24.04684829711914, "loss": 0.458, "rewards/accuracies": 1.0, "rewards/chosen": 0.6628751754760742, "rewards/margins": 0.6628751754760742, "rewards/rejected": 0.0, "step": 168 }, { "epoch": 0.9441340782122905, "grad_norm": 19.40231327977891, "learning_rate": 5.888501742160279e-07, "logits/chosen": -3.560133934020996, "logits/rejected": -3.58990216255188, "logps/chosen": -19.82522201538086, "logps/rejected": -28.30991554260254, "loss": 0.4138, "rewards/accuracies": 1.0, "rewards/chosen": 0.792276918888092, "rewards/margins": 0.792276918888092, "rewards/rejected": 0.0, "step": 169 }, { "epoch": 0.9497206703910615, "grad_norm": 12.42947365843963, "learning_rate": 5.923344947735192e-07, "logits/chosen": -3.2126545906066895, "logits/rejected": -3.3703675270080566, "logps/chosen": -29.75718116760254, "logps/rejected": -31.64392852783203, "loss": 0.3579, "rewards/accuracies": 0.75, "rewards/chosen": 0.7188339829444885, "rewards/margins": 0.7188339829444885, "rewards/rejected": 0.0, "step": 170 }, { "epoch": 0.9553072625698324, "grad_norm": 18.202343205238847, "learning_rate": 5.958188153310105e-07, "logits/chosen": -3.717379570007324, "logits/rejected": -3.5793988704681396, "logps/chosen": -20.444347381591797, "logps/rejected": -33.93192672729492, "loss": 0.3897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9968618750572205, "rewards/margins": 0.9968618750572205, "rewards/rejected": 0.0, "step": 171 }, { "epoch": 0.9608938547486033, "grad_norm": 12.832841852369326, "learning_rate": 5.993031358885016e-07, "logits/chosen": -3.4382286071777344, "logits/rejected": -3.317105770111084, "logps/chosen": -18.771831512451172, "logps/rejected": -34.331298828125, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 0.8643215894699097, "rewards/margins": 0.8643215894699097, "rewards/rejected": 0.0, "step": 172 }, { "epoch": 0.9664804469273743, "grad_norm": 11.857640826249451, "learning_rate": 6.02787456445993e-07, "logits/chosen": -3.6196374893188477, "logits/rejected": -3.3753185272216797, "logps/chosen": -19.114072799682617, "logps/rejected": -29.907459259033203, "loss": 0.3557, "rewards/accuracies": 1.0, "rewards/chosen": 0.9212259650230408, "rewards/margins": 0.9212259650230408, "rewards/rejected": 0.0, "step": 173 }, { "epoch": 0.9720670391061452, "grad_norm": 11.028463714669611, "learning_rate": 6.062717770034843e-07, "logits/chosen": -3.2672202587127686, "logits/rejected": -3.3163845539093018, "logps/chosen": -23.178640365600586, "logps/rejected": -55.991661071777344, "loss": 0.3474, "rewards/accuracies": 1.0, "rewards/chosen": 0.8489004969596863, "rewards/margins": 0.8489004969596863, "rewards/rejected": 0.0, "step": 174 }, { "epoch": 0.9776536312849162, "grad_norm": 12.810488081859203, "learning_rate": 6.097560975609756e-07, "logits/chosen": -3.150639057159424, "logits/rejected": -3.3473706245422363, "logps/chosen": -55.005409240722656, "logps/rejected": -28.745824813842773, "loss": 0.3478, "rewards/accuracies": 1.0, "rewards/chosen": 1.4712138175964355, "rewards/margins": 1.4712138175964355, "rewards/rejected": 0.0, "step": 175 }, { "epoch": 0.9832402234636871, "grad_norm": 13.180414956676238, "learning_rate": 6.132404181184669e-07, "logits/chosen": -3.620823621749878, "logits/rejected": -3.6447391510009766, "logps/chosen": -10.457490921020508, "logps/rejected": -29.693206787109375, "loss": 0.3509, "rewards/accuracies": 1.0, "rewards/chosen": 0.6984259486198425, "rewards/margins": 0.6984259486198425, "rewards/rejected": 0.0, "step": 176 }, { "epoch": 0.9888268156424581, "grad_norm": 22.420225225210046, "learning_rate": 6.167247386759582e-07, "logits/chosen": -3.5637428760528564, "logits/rejected": -3.59216046333313, "logps/chosen": -16.807857513427734, "logps/rejected": -19.454391479492188, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 0.7911770343780518, "rewards/margins": 0.7911770343780518, "rewards/rejected": 0.0, "step": 177 }, { "epoch": 0.994413407821229, "grad_norm": 11.80586886342996, "learning_rate": 6.202090592334494e-07, "logits/chosen": -3.3847668170928955, "logits/rejected": -3.4201109409332275, "logps/chosen": -14.308084487915039, "logps/rejected": -50.307579040527344, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 1.090038776397705, "rewards/margins": 1.090038776397705, "rewards/rejected": 0.0, "step": 178 }, { "epoch": 1.0, "grad_norm": 16.952966215261824, "learning_rate": 6.236933797909407e-07, "logits/chosen": -3.4724905490875244, "logits/rejected": -3.308558702468872, "logps/chosen": -27.244325637817383, "logps/rejected": -37.93553161621094, "loss": 0.3722, "rewards/accuracies": 1.0, "rewards/chosen": 1.1800241470336914, "rewards/margins": 1.1800241470336914, "rewards/rejected": 0.0, "step": 179 }, { "epoch": 1.005586592178771, "grad_norm": 9.105872976664521, "learning_rate": 6.27177700348432e-07, "logits/chosen": -3.6326866149902344, "logits/rejected": -3.5780582427978516, "logps/chosen": -12.944976806640625, "logps/rejected": -18.96871566772461, "loss": 0.3302, "rewards/accuracies": 1.0, "rewards/chosen": 1.0247759819030762, "rewards/margins": 1.0247759819030762, "rewards/rejected": 0.0, "step": 180 }, { "epoch": 1.005586592178771, "eval_logits/chosen": -3.2065823078155518, "eval_logits/rejected": -3.2086079120635986, "eval_logps/chosen": -20.46588897705078, "eval_logps/rejected": -31.99078369140625, "eval_loss": 0.36149224638938904, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 0.960415244102478, "eval_rewards/margins": 0.960415244102478, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6009, "eval_samples_per_second": 9.226, "eval_steps_per_second": 0.298, "step": 180 }, { "epoch": 1.011173184357542, "grad_norm": 13.19416012523754, "learning_rate": 6.306620209059234e-07, "logits/chosen": -3.2777421474456787, "logits/rejected": -3.3989064693450928, "logps/chosen": -14.293594360351562, "logps/rejected": -17.12541389465332, "loss": 0.3666, "rewards/accuracies": 1.0, "rewards/chosen": 0.5113266706466675, "rewards/margins": 0.5113266706466675, "rewards/rejected": 0.0, "step": 181 }, { "epoch": 1.0167597765363128, "grad_norm": 10.887028146578567, "learning_rate": 6.341463414634146e-07, "logits/chosen": -3.1348888874053955, "logits/rejected": -3.224384307861328, "logps/chosen": -21.554080963134766, "logps/rejected": -20.96063232421875, "loss": 0.3146, "rewards/accuracies": 1.0, "rewards/chosen": 0.9010570645332336, "rewards/margins": 0.9010570645332336, "rewards/rejected": 0.0, "step": 182 }, { "epoch": 1.0223463687150838, "grad_norm": 8.963970647137387, "learning_rate": 6.376306620209058e-07, "logits/chosen": -3.125688076019287, "logits/rejected": -3.066281795501709, "logps/chosen": -17.731704711914062, "logps/rejected": -16.59673309326172, "loss": 0.2865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9751240611076355, "rewards/margins": 0.9751240611076355, "rewards/rejected": 0.0, "step": 183 }, { "epoch": 1.0279329608938548, "grad_norm": 10.288390190064316, "learning_rate": 6.411149825783971e-07, "logits/chosen": -3.3919193744659424, "logits/rejected": -3.0586678981781006, "logps/chosen": -16.1546688079834, "logps/rejected": -39.81340026855469, "loss": 0.3408, "rewards/accuracies": 1.0, "rewards/chosen": 1.247828722000122, "rewards/margins": 1.247828722000122, "rewards/rejected": 0.0, "step": 184 }, { "epoch": 1.0335195530726258, "grad_norm": 11.02350587817682, "learning_rate": 6.445993031358885e-07, "logits/chosen": -3.785944938659668, "logits/rejected": -3.5418670177459717, "logps/chosen": -16.335508346557617, "logps/rejected": -20.58266830444336, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.0386725664138794, "rewards/margins": 1.0386725664138794, "rewards/rejected": 0.0, "step": 185 }, { "epoch": 1.0391061452513966, "grad_norm": 16.289447666313198, "learning_rate": 6.480836236933798e-07, "logits/chosen": -3.668846845626831, "logits/rejected": -3.4984891414642334, "logps/chosen": -25.417245864868164, "logps/rejected": -39.81985092163086, "loss": 0.4073, "rewards/accuracies": 1.0, "rewards/chosen": 1.1094753742218018, "rewards/margins": 1.1094753742218018, "rewards/rejected": 0.0, "step": 186 }, { "epoch": 1.0446927374301676, "grad_norm": 15.073431374332136, "learning_rate": 6.51567944250871e-07, "logits/chosen": -3.4357309341430664, "logits/rejected": -3.447826623916626, "logps/chosen": -21.84659194946289, "logps/rejected": -23.085220336914062, "loss": 0.3643, "rewards/accuracies": 1.0, "rewards/chosen": 1.3426122665405273, "rewards/margins": 1.3426122665405273, "rewards/rejected": 0.0, "step": 187 }, { "epoch": 1.0502793296089385, "grad_norm": 10.116314298618896, "learning_rate": 6.550522648083623e-07, "logits/chosen": -3.488616704940796, "logits/rejected": -3.547907590866089, "logps/chosen": -20.659770965576172, "logps/rejected": -44.12837219238281, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 0.8681479096412659, "rewards/margins": 0.8681479096412659, "rewards/rejected": 0.0, "step": 188 }, { "epoch": 1.0558659217877095, "grad_norm": 10.04057046229293, "learning_rate": 6.585365853658536e-07, "logits/chosen": -3.3828887939453125, "logits/rejected": -3.5210986137390137, "logps/chosen": -15.001754760742188, "logps/rejected": -18.628276824951172, "loss": 0.3095, "rewards/accuracies": 1.0, "rewards/chosen": 0.7176334857940674, "rewards/margins": 0.7176334857940674, "rewards/rejected": 0.0, "step": 189 }, { "epoch": 1.0614525139664805, "grad_norm": 15.007274677524912, "learning_rate": 6.620209059233449e-07, "logits/chosen": -3.0460798740386963, "logits/rejected": -3.2319722175598145, "logps/chosen": -32.32198715209961, "logps/rejected": -35.442230224609375, "loss": 0.379, "rewards/accuracies": 1.0, "rewards/chosen": 0.6987197399139404, "rewards/margins": 0.6987197399139404, "rewards/rejected": 0.0, "step": 190 }, { "epoch": 1.0670391061452513, "grad_norm": 9.10362018167136, "learning_rate": 6.655052264808362e-07, "logits/chosen": -3.4198787212371826, "logits/rejected": -3.2804646492004395, "logps/chosen": -10.96152114868164, "logps/rejected": -31.262535095214844, "loss": 0.3236, "rewards/accuracies": 1.0, "rewards/chosen": 0.6560508012771606, "rewards/margins": 0.6560508012771606, "rewards/rejected": 0.0, "step": 191 }, { "epoch": 1.0726256983240223, "grad_norm": 9.590605833890953, "learning_rate": 6.689895470383276e-07, "logits/chosen": -3.66292142868042, "logits/rejected": -3.4462697505950928, "logps/chosen": -16.839336395263672, "logps/rejected": -34.59306716918945, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 1.2217531204223633, "rewards/margins": 1.2217531204223633, "rewards/rejected": 0.0, "step": 192 }, { "epoch": 1.0782122905027933, "grad_norm": 14.136300690926081, "learning_rate": 6.724738675958188e-07, "logits/chosen": -3.2333831787109375, "logits/rejected": -3.228888750076294, "logps/chosen": -20.69516372680664, "logps/rejected": -31.679576873779297, "loss": 0.3615, "rewards/accuracies": 0.75, "rewards/chosen": 0.5942341685295105, "rewards/margins": 0.5942341685295105, "rewards/rejected": 0.0, "step": 193 }, { "epoch": 1.0837988826815643, "grad_norm": 9.73848287694558, "learning_rate": 6.759581881533101e-07, "logits/chosen": -2.731956720352173, "logits/rejected": -2.8023910522460938, "logps/chosen": -15.538613319396973, "logps/rejected": -35.99656677246094, "loss": 0.3398, "rewards/accuracies": 1.0, "rewards/chosen": 1.0924439430236816, "rewards/margins": 1.0924439430236816, "rewards/rejected": 0.0, "step": 194 }, { "epoch": 1.089385474860335, "grad_norm": 13.504403269737955, "learning_rate": 6.794425087108013e-07, "logits/chosen": -3.1872870922088623, "logits/rejected": -3.1580259799957275, "logps/chosen": -48.09290313720703, "logps/rejected": -29.916027069091797, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 1.0472760200500488, "rewards/margins": 1.0472760200500488, "rewards/rejected": 0.0, "step": 195 }, { "epoch": 1.094972067039106, "grad_norm": 9.013293702308625, "learning_rate": 6.829268292682927e-07, "logits/chosen": -3.5854601860046387, "logits/rejected": -3.320793867111206, "logps/chosen": -15.177647590637207, "logps/rejected": -46.74102783203125, "loss": 0.3142, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823977112770081, "rewards/margins": 0.8823977112770081, "rewards/rejected": 0.0, "step": 196 }, { "epoch": 1.100558659217877, "grad_norm": 9.944518309167927, "learning_rate": 6.86411149825784e-07, "logits/chosen": -3.352811574935913, "logits/rejected": -3.3195114135742188, "logps/chosen": -14.030511856079102, "logps/rejected": -51.092716217041016, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 0.9310657978057861, "rewards/margins": 0.9310657978057861, "rewards/rejected": 0.0, "step": 197 }, { "epoch": 1.106145251396648, "grad_norm": 9.619422555624382, "learning_rate": 6.898954703832752e-07, "logits/chosen": -3.5843019485473633, "logits/rejected": -3.1107637882232666, "logps/chosen": -5.203390121459961, "logps/rejected": -27.75282096862793, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 0.6825684905052185, "rewards/margins": 0.6825684905052185, "rewards/rejected": 0.0, "step": 198 }, { "epoch": 1.111731843575419, "grad_norm": 12.838451492103111, "learning_rate": 6.933797909407665e-07, "logits/chosen": -3.3617866039276123, "logits/rejected": -3.123973846435547, "logps/chosen": -53.03567886352539, "logps/rejected": -36.53765106201172, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 0.8718262314796448, "rewards/margins": 0.8718262314796448, "rewards/rejected": 0.0, "step": 199 }, { "epoch": 1.1173184357541899, "grad_norm": 10.727712294253553, "learning_rate": 6.968641114982578e-07, "logits/chosen": -2.0662670135498047, "logits/rejected": -2.08182692527771, "logps/chosen": -52.00320816040039, "logps/rejected": -73.17282104492188, "loss": 0.3375, "rewards/accuracies": 0.75, "rewards/chosen": 0.8044001460075378, "rewards/margins": 0.8044001460075378, "rewards/rejected": 0.0, "step": 200 }, { "epoch": 1.1173184357541899, "eval_logits/chosen": -3.246267795562744, "eval_logits/rejected": -3.2582263946533203, "eval_logps/chosen": -20.083738327026367, "eval_logps/rejected": -31.633167266845703, "eval_loss": 0.3558894991874695, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 0.9986303448677063, "eval_rewards/margins": 0.9986303448677063, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5976, "eval_samples_per_second": 9.227, "eval_steps_per_second": 0.298, "step": 200 }, { "epoch": 1.1229050279329609, "grad_norm": 10.59758427732295, "learning_rate": 7.003484320557491e-07, "logits/chosen": -3.505729913711548, "logits/rejected": -3.1870007514953613, "logps/chosen": -10.572619438171387, "logps/rejected": -52.28015899658203, "loss": 0.3449, "rewards/accuracies": 1.0, "rewards/chosen": 0.7204795479774475, "rewards/margins": 0.7204795479774475, "rewards/rejected": 0.0, "step": 201 }, { "epoch": 1.1284916201117319, "grad_norm": 15.850633721679678, "learning_rate": 7.038327526132404e-07, "logits/chosen": -3.2468059062957764, "logits/rejected": -3.3311328887939453, "logps/chosen": -19.04383659362793, "logps/rejected": -15.986388206481934, "loss": 0.3689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9080885052680969, "rewards/margins": 0.9080885052680969, "rewards/rejected": 0.0, "step": 202 }, { "epoch": 1.1340782122905029, "grad_norm": 12.773768896781231, "learning_rate": 7.073170731707316e-07, "logits/chosen": -3.3263590335845947, "logits/rejected": -3.251199722290039, "logps/chosen": -32.89419937133789, "logps/rejected": -29.640453338623047, "loss": 0.3193, "rewards/accuracies": 0.75, "rewards/chosen": 0.5567541122436523, "rewards/margins": 0.5567541122436523, "rewards/rejected": 0.0, "step": 203 }, { "epoch": 1.1396648044692737, "grad_norm": 10.169395063409917, "learning_rate": 7.10801393728223e-07, "logits/chosen": -3.357675075531006, "logits/rejected": -3.479375123977661, "logps/chosen": -18.883747100830078, "logps/rejected": -26.06609344482422, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0959761142730713, "rewards/margins": 1.0959761142730713, "rewards/rejected": 0.0, "step": 204 }, { "epoch": 1.1452513966480447, "grad_norm": 11.535769216754757, "learning_rate": 7.142857142857143e-07, "logits/chosen": -3.1873953342437744, "logits/rejected": -3.218331813812256, "logps/chosen": -43.96869659423828, "logps/rejected": -32.669837951660156, "loss": 0.3341, "rewards/accuracies": 1.0, "rewards/chosen": 1.4571388959884644, "rewards/margins": 1.4571388959884644, "rewards/rejected": 0.0, "step": 205 }, { "epoch": 1.1508379888268156, "grad_norm": 12.745600341451343, "learning_rate": 7.177700348432055e-07, "logits/chosen": -3.6761059761047363, "logits/rejected": -3.639869213104248, "logps/chosen": -19.122093200683594, "logps/rejected": -23.711383819580078, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 0.8524357080459595, "rewards/margins": 0.8524357080459595, "rewards/rejected": 0.0, "step": 206 }, { "epoch": 1.1564245810055866, "grad_norm": 9.378824141835013, "learning_rate": 7.212543554006968e-07, "logits/chosen": -3.5651609897613525, "logits/rejected": -3.3586459159851074, "logps/chosen": -21.362688064575195, "logps/rejected": -30.987037658691406, "loss": 0.3087, "rewards/accuracies": 1.0, "rewards/chosen": 1.2608428001403809, "rewards/margins": 1.2608428001403809, "rewards/rejected": 0.0, "step": 207 }, { "epoch": 1.1620111731843576, "grad_norm": 14.298346495114684, "learning_rate": 7.247386759581882e-07, "logits/chosen": -3.445509910583496, "logits/rejected": -3.559532403945923, "logps/chosen": -15.627222061157227, "logps/rejected": -48.640472412109375, "loss": 0.3454, "rewards/accuracies": 1.0, "rewards/chosen": 0.47943437099456787, "rewards/margins": 0.47943437099456787, "rewards/rejected": 0.0, "step": 208 }, { "epoch": 1.1675977653631284, "grad_norm": 17.065773568044825, "learning_rate": 7.282229965156794e-07, "logits/chosen": -3.4207494258880615, "logits/rejected": -3.4317336082458496, "logps/chosen": -15.158013343811035, "logps/rejected": -26.077003479003906, "loss": 0.3768, "rewards/accuracies": 1.0, "rewards/chosen": 0.6948219537734985, "rewards/margins": 0.6948219537734985, "rewards/rejected": 0.0, "step": 209 }, { "epoch": 1.1731843575418994, "grad_norm": 10.558484474657567, "learning_rate": 7.317073170731707e-07, "logits/chosen": -3.489943265914917, "logits/rejected": -3.3792459964752197, "logps/chosen": -13.803016662597656, "logps/rejected": -21.387771606445312, "loss": 0.2991, "rewards/accuracies": 1.0, "rewards/chosen": 0.9277787208557129, "rewards/margins": 0.9277787208557129, "rewards/rejected": 0.0, "step": 210 }, { "epoch": 1.1787709497206704, "grad_norm": 14.878823416144341, "learning_rate": 7.351916376306619e-07, "logits/chosen": -3.1514551639556885, "logits/rejected": -2.85518741607666, "logps/chosen": -17.18071746826172, "logps/rejected": -34.49565887451172, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 1.2792776823043823, "rewards/margins": 1.2792776823043823, "rewards/rejected": 0.0, "step": 211 }, { "epoch": 1.1843575418994414, "grad_norm": 9.848084960506721, "learning_rate": 7.386759581881533e-07, "logits/chosen": -3.138712167739868, "logits/rejected": -3.03584361076355, "logps/chosen": -15.721365928649902, "logps/rejected": -16.612977981567383, "loss": 0.302, "rewards/accuracies": 1.0, "rewards/chosen": 1.1289442777633667, "rewards/margins": 1.1289442777633667, "rewards/rejected": 0.0, "step": 212 }, { "epoch": 1.1899441340782122, "grad_norm": 10.805227371998999, "learning_rate": 7.421602787456446e-07, "logits/chosen": -3.6576991081237793, "logits/rejected": -3.5775070190429688, "logps/chosen": -22.21623992919922, "logps/rejected": -49.09683609008789, "loss": 0.315, "rewards/accuracies": 1.0, "rewards/chosen": 1.1914067268371582, "rewards/margins": 1.1914067268371582, "rewards/rejected": 0.0, "step": 213 }, { "epoch": 1.1955307262569832, "grad_norm": 10.320243828582568, "learning_rate": 7.456445993031358e-07, "logits/chosen": -3.409869909286499, "logits/rejected": -3.401746988296509, "logps/chosen": -26.289350509643555, "logps/rejected": -28.945384979248047, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.3901445865631104, "rewards/margins": 1.3901445865631104, "rewards/rejected": 0.0, "step": 214 }, { "epoch": 1.2011173184357542, "grad_norm": 14.18558383515029, "learning_rate": 7.491289198606271e-07, "logits/chosen": -3.62727427482605, "logits/rejected": -3.4659011363983154, "logps/chosen": -36.72397994995117, "logps/rejected": -21.145355224609375, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 0.7905348539352417, "rewards/margins": 0.7905348539352417, "rewards/rejected": 0.0, "step": 215 }, { "epoch": 1.2067039106145252, "grad_norm": 10.488275048991248, "learning_rate": 7.526132404181185e-07, "logits/chosen": -3.6146132946014404, "logits/rejected": -3.3445773124694824, "logps/chosen": -22.47472381591797, "logps/rejected": -28.731056213378906, "loss": 0.2963, "rewards/accuracies": 1.0, "rewards/chosen": 1.4101334810256958, "rewards/margins": 1.4101334810256958, "rewards/rejected": 0.0, "step": 216 }, { "epoch": 1.2122905027932962, "grad_norm": 8.704684794484079, "learning_rate": 7.560975609756097e-07, "logits/chosen": -3.5806429386138916, "logits/rejected": -3.7555086612701416, "logps/chosen": -16.60782814025879, "logps/rejected": -26.24691390991211, "loss": 0.2961, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293834209442139, "rewards/margins": 1.0293834209442139, "rewards/rejected": 0.0, "step": 217 }, { "epoch": 1.217877094972067, "grad_norm": 9.698408791817338, "learning_rate": 7.59581881533101e-07, "logits/chosen": -3.216071605682373, "logits/rejected": -3.0813839435577393, "logps/chosen": -23.708539962768555, "logps/rejected": -29.808490753173828, "loss": 0.3132, "rewards/accuracies": 1.0, "rewards/chosen": 1.2182135581970215, "rewards/margins": 1.2182135581970215, "rewards/rejected": 0.0, "step": 218 }, { "epoch": 1.223463687150838, "grad_norm": 9.811614676428968, "learning_rate": 7.630662020905922e-07, "logits/chosen": -3.2354507446289062, "logits/rejected": -3.1731200218200684, "logps/chosen": -28.39469337463379, "logps/rejected": -34.51610565185547, "loss": 0.2879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0406968593597412, "rewards/margins": 1.0406968593597412, "rewards/rejected": 0.0, "step": 219 }, { "epoch": 1.229050279329609, "grad_norm": 11.451301511630755, "learning_rate": 7.665505226480836e-07, "logits/chosen": -3.4079675674438477, "logits/rejected": -3.562708616256714, "logps/chosen": -50.24363708496094, "logps/rejected": -36.007076263427734, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 0.8625876903533936, "rewards/margins": 0.8625876903533936, "rewards/rejected": 0.0, "step": 220 }, { "epoch": 1.229050279329609, "eval_logits/chosen": -3.2746100425720215, "eval_logits/rejected": -3.2954063415527344, "eval_logps/chosen": -19.88897705078125, "eval_logps/rejected": -31.52643394470215, "eval_loss": 0.35166430473327637, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0181065797805786, "eval_rewards/margins": 1.0181065797805786, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5481, "eval_samples_per_second": 9.24, "eval_steps_per_second": 0.298, "step": 220 }, { "epoch": 1.23463687150838, "grad_norm": 9.326879656223309, "learning_rate": 7.700348432055749e-07, "logits/chosen": -3.5331428050994873, "logits/rejected": -3.4841206073760986, "logps/chosen": -15.98175048828125, "logps/rejected": -30.6879940032959, "loss": 0.3262, "rewards/accuracies": 1.0, "rewards/chosen": 1.2041575908660889, "rewards/margins": 1.2041575908660889, "rewards/rejected": 0.0, "step": 221 }, { "epoch": 1.2402234636871508, "grad_norm": 12.3079719112879, "learning_rate": 7.735191637630662e-07, "logits/chosen": -3.518021583557129, "logits/rejected": -3.4510629177093506, "logps/chosen": -27.463146209716797, "logps/rejected": -17.00018310546875, "loss": 0.3618, "rewards/accuracies": 0.75, "rewards/chosen": 0.3559202551841736, "rewards/margins": 0.3559202551841736, "rewards/rejected": 0.0, "step": 222 }, { "epoch": 1.2458100558659218, "grad_norm": 11.308218355800076, "learning_rate": 7.770034843205574e-07, "logits/chosen": -3.3539228439331055, "logits/rejected": -3.5053908824920654, "logps/chosen": -9.660523414611816, "logps/rejected": -34.22919464111328, "loss": 0.3076, "rewards/accuracies": 1.0, "rewards/chosen": 0.7283318638801575, "rewards/margins": 0.7283318638801575, "rewards/rejected": 0.0, "step": 223 }, { "epoch": 1.2513966480446927, "grad_norm": 15.695720858566277, "learning_rate": 7.804878048780488e-07, "logits/chosen": -3.4702861309051514, "logits/rejected": -3.6778104305267334, "logps/chosen": -17.774089813232422, "logps/rejected": -22.740568161010742, "loss": 0.3975, "rewards/accuracies": 1.0, "rewards/chosen": 1.1346460580825806, "rewards/margins": 1.1346460580825806, "rewards/rejected": 0.0, "step": 224 }, { "epoch": 1.2569832402234637, "grad_norm": 9.243920919200582, "learning_rate": 7.8397212543554e-07, "logits/chosen": -3.1038174629211426, "logits/rejected": -3.1146140098571777, "logps/chosen": -45.7878303527832, "logps/rejected": -30.96743392944336, "loss": 0.2978, "rewards/accuracies": 1.0, "rewards/chosen": 0.9360488057136536, "rewards/margins": 0.9360488057136536, "rewards/rejected": 0.0, "step": 225 }, { "epoch": 1.2625698324022347, "grad_norm": 11.13129466714581, "learning_rate": 7.874564459930313e-07, "logits/chosen": -3.5532631874084473, "logits/rejected": -3.4976348876953125, "logps/chosen": -30.162281036376953, "logps/rejected": -29.01420783996582, "loss": 0.3294, "rewards/accuracies": 0.75, "rewards/chosen": 0.7946181893348694, "rewards/margins": 0.7946181893348694, "rewards/rejected": 0.0, "step": 226 }, { "epoch": 1.2681564245810055, "grad_norm": 10.721423149528743, "learning_rate": 7.909407665505227e-07, "logits/chosen": -3.399590253829956, "logits/rejected": -3.3550968170166016, "logps/chosen": -28.896617889404297, "logps/rejected": -39.47148895263672, "loss": 0.3103, "rewards/accuracies": 1.0, "rewards/chosen": 1.5518330335617065, "rewards/margins": 1.5518330335617065, "rewards/rejected": 0.0, "step": 227 }, { "epoch": 1.2737430167597765, "grad_norm": 10.794643046108307, "learning_rate": 7.944250871080139e-07, "logits/chosen": -3.1023850440979004, "logits/rejected": -3.070626974105835, "logps/chosen": -26.629072189331055, "logps/rejected": -31.59149742126465, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": 0.7542858719825745, "rewards/margins": 0.7542858719825745, "rewards/rejected": 0.0, "step": 228 }, { "epoch": 1.2793296089385475, "grad_norm": 9.710935244235399, "learning_rate": 7.979094076655052e-07, "logits/chosen": -3.3577988147735596, "logits/rejected": -3.3791284561157227, "logps/chosen": -34.757232666015625, "logps/rejected": -45.71214294433594, "loss": 0.2734, "rewards/accuracies": 1.0, "rewards/chosen": 1.129239559173584, "rewards/margins": 1.129239559173584, "rewards/rejected": 0.0, "step": 229 }, { "epoch": 1.2849162011173183, "grad_norm": 10.214850798494435, "learning_rate": 8.013937282229964e-07, "logits/chosen": -3.270902633666992, "logits/rejected": -3.060398578643799, "logps/chosen": -14.818272590637207, "logps/rejected": -36.052574157714844, "loss": 0.3036, "rewards/accuracies": 1.0, "rewards/chosen": 1.0607706308364868, "rewards/margins": 1.0607706308364868, "rewards/rejected": 0.0, "step": 230 }, { "epoch": 1.2905027932960893, "grad_norm": 15.427190116461567, "learning_rate": 8.048780487804878e-07, "logits/chosen": -3.6726341247558594, "logits/rejected": -3.629931926727295, "logps/chosen": -15.026057243347168, "logps/rejected": -25.69697380065918, "loss": 0.3639, "rewards/accuracies": 1.0, "rewards/chosen": 0.7771012783050537, "rewards/margins": 0.7771012783050537, "rewards/rejected": 0.0, "step": 231 }, { "epoch": 1.2960893854748603, "grad_norm": 16.71952562743431, "learning_rate": 8.083623693379791e-07, "logits/chosen": -3.426694631576538, "logits/rejected": -3.055060863494873, "logps/chosen": -23.176193237304688, "logps/rejected": -103.94121551513672, "loss": 0.3559, "rewards/accuracies": 0.75, "rewards/chosen": 0.7330259680747986, "rewards/margins": 0.7330259680747986, "rewards/rejected": 0.0, "step": 232 }, { "epoch": 1.3016759776536313, "grad_norm": 13.460670617842231, "learning_rate": 8.118466898954704e-07, "logits/chosen": -2.560324192047119, "logits/rejected": -2.674717903137207, "logps/chosen": -43.27853775024414, "logps/rejected": -37.60259246826172, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 1.79660964012146, "rewards/margins": 1.79660964012146, "rewards/rejected": 0.0, "step": 233 }, { "epoch": 1.3072625698324023, "grad_norm": 9.312146717749025, "learning_rate": 8.153310104529616e-07, "logits/chosen": -3.848695755004883, "logits/rejected": -3.8059310913085938, "logps/chosen": -16.219894409179688, "logps/rejected": -28.943775177001953, "loss": 0.2984, "rewards/accuracies": 1.0, "rewards/chosen": 1.3555338382720947, "rewards/margins": 1.3555338382720947, "rewards/rejected": 0.0, "step": 234 }, { "epoch": 1.3128491620111733, "grad_norm": 14.641951211411621, "learning_rate": 8.188153310104529e-07, "logits/chosen": -2.8487823009490967, "logits/rejected": -3.1932733058929443, "logps/chosen": -52.92893981933594, "logps/rejected": -19.277240753173828, "loss": 0.383, "rewards/accuracies": 1.0, "rewards/chosen": 0.8276066780090332, "rewards/margins": 0.8276066780090332, "rewards/rejected": 0.0, "step": 235 }, { "epoch": 1.318435754189944, "grad_norm": 8.445490926381995, "learning_rate": 8.222996515679442e-07, "logits/chosen": -3.67144775390625, "logits/rejected": -3.691436529159546, "logps/chosen": -31.82388687133789, "logps/rejected": -26.39704132080078, "loss": 0.2616, "rewards/accuracies": 1.0, "rewards/chosen": 1.5748435258865356, "rewards/margins": 1.5748435258865356, "rewards/rejected": 0.0, "step": 236 }, { "epoch": 1.324022346368715, "grad_norm": 9.763384731492138, "learning_rate": 8.257839721254355e-07, "logits/chosen": -3.3791232109069824, "logits/rejected": -3.5130560398101807, "logps/chosen": -10.164708137512207, "logps/rejected": -49.38196563720703, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 0.7769471406936646, "rewards/margins": 0.7769471406936646, "rewards/rejected": 0.0, "step": 237 }, { "epoch": 1.329608938547486, "grad_norm": 15.350243675651864, "learning_rate": 8.292682926829268e-07, "logits/chosen": -3.604556083679199, "logits/rejected": -3.3559699058532715, "logps/chosen": -32.71151351928711, "logps/rejected": -22.76406478881836, "loss": 0.3143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1828868389129639, "rewards/margins": 1.1828868389129639, "rewards/rejected": 0.0, "step": 238 }, { "epoch": 1.3351955307262569, "grad_norm": 11.556880663671324, "learning_rate": 8.327526132404182e-07, "logits/chosen": -3.514572858810425, "logits/rejected": -3.5072760581970215, "logps/chosen": -12.79554557800293, "logps/rejected": -48.96214294433594, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 1.2840112447738647, "rewards/margins": 1.2840112447738647, "rewards/rejected": 0.0, "step": 239 }, { "epoch": 1.3407821229050279, "grad_norm": 11.486460847498583, "learning_rate": 8.362369337979093e-07, "logits/chosen": -3.382394552230835, "logits/rejected": -3.1789932250976562, "logps/chosen": -19.617023468017578, "logps/rejected": -38.0474853515625, "loss": 0.3177, "rewards/accuracies": 1.0, "rewards/chosen": 0.8296595215797424, "rewards/margins": 0.8296595215797424, "rewards/rejected": 0.0, "step": 240 }, { "epoch": 1.3407821229050279, "eval_logits/chosen": -3.2941277027130127, "eval_logits/rejected": -3.319910764694214, "eval_logps/chosen": -19.708602905273438, "eval_logps/rejected": -31.385814666748047, "eval_loss": 0.34877967834472656, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0361440181732178, "eval_rewards/margins": 1.0361440181732178, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5929, "eval_samples_per_second": 9.228, "eval_steps_per_second": 0.298, "step": 240 }, { "epoch": 1.3463687150837989, "grad_norm": 10.085361199189645, "learning_rate": 8.397212543554006e-07, "logits/chosen": -3.6434545516967773, "logits/rejected": -3.575937032699585, "logps/chosen": -16.716463088989258, "logps/rejected": -34.58618927001953, "loss": 0.3216, "rewards/accuracies": 1.0, "rewards/chosen": 1.2842942476272583, "rewards/margins": 1.2842942476272583, "rewards/rejected": 0.0, "step": 241 }, { "epoch": 1.3519553072625698, "grad_norm": 13.10261943018632, "learning_rate": 8.432055749128919e-07, "logits/chosen": -2.625068426132202, "logits/rejected": -2.675497055053711, "logps/chosen": -23.205856323242188, "logps/rejected": -25.172626495361328, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 0.9943975210189819, "rewards/margins": 0.9943975210189819, "rewards/rejected": 0.0, "step": 242 }, { "epoch": 1.3575418994413408, "grad_norm": 9.178930009945624, "learning_rate": 8.466898954703833e-07, "logits/chosen": -3.7004106044769287, "logits/rejected": -3.8466243743896484, "logps/chosen": -11.391777992248535, "logps/rejected": -27.08580780029297, "loss": 0.2945, "rewards/accuracies": 1.0, "rewards/chosen": 0.8786574006080627, "rewards/margins": 0.8786574006080627, "rewards/rejected": 0.0, "step": 243 }, { "epoch": 1.3631284916201118, "grad_norm": 12.984989957600904, "learning_rate": 8.501742160278746e-07, "logits/chosen": -3.639615297317505, "logits/rejected": -3.54256272315979, "logps/chosen": -18.723716735839844, "logps/rejected": -28.6210880279541, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9797879457473755, "rewards/margins": 0.9797879457473755, "rewards/rejected": 0.0, "step": 244 }, { "epoch": 1.3687150837988826, "grad_norm": 10.364666344721781, "learning_rate": 8.536585365853657e-07, "logits/chosen": -3.396254062652588, "logits/rejected": -3.1957669258117676, "logps/chosen": -17.962038040161133, "logps/rejected": -25.107196807861328, "loss": 0.2727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0272819995880127, "rewards/margins": 1.0272819995880127, "rewards/rejected": 0.0, "step": 245 }, { "epoch": 1.3743016759776536, "grad_norm": 9.651011599680983, "learning_rate": 8.57142857142857e-07, "logits/chosen": -3.681082248687744, "logits/rejected": -3.4493260383605957, "logps/chosen": -24.472599029541016, "logps/rejected": -26.830467224121094, "loss": 0.2953, "rewards/accuracies": 1.0, "rewards/chosen": 1.5167380571365356, "rewards/margins": 1.5167380571365356, "rewards/rejected": 0.0, "step": 246 }, { "epoch": 1.3798882681564246, "grad_norm": 10.838107337723281, "learning_rate": 8.606271777003484e-07, "logits/chosen": -3.1495206356048584, "logits/rejected": -3.2228331565856934, "logps/chosen": -12.363944053649902, "logps/rejected": -28.788822174072266, "loss": 0.346, "rewards/accuracies": 0.75, "rewards/chosen": 0.5605226159095764, "rewards/margins": 0.5605226159095764, "rewards/rejected": 0.0, "step": 247 }, { "epoch": 1.3854748603351954, "grad_norm": 11.827297529444412, "learning_rate": 8.641114982578397e-07, "logits/chosen": -3.316638231277466, "logits/rejected": -3.3069708347320557, "logps/chosen": -18.166549682617188, "logps/rejected": -42.66516876220703, "loss": 0.2912, "rewards/accuracies": 1.0, "rewards/chosen": 1.2724140882492065, "rewards/margins": 1.2724140882492065, "rewards/rejected": 0.0, "step": 248 }, { "epoch": 1.3910614525139664, "grad_norm": 11.01082188378406, "learning_rate": 8.67595818815331e-07, "logits/chosen": -3.594625949859619, "logits/rejected": -3.4195151329040527, "logps/chosen": -17.03807830810547, "logps/rejected": -22.575963973999023, "loss": 0.2864, "rewards/accuracies": 1.0, "rewards/chosen": 1.2216452360153198, "rewards/margins": 1.2216452360153198, "rewards/rejected": 0.0, "step": 249 }, { "epoch": 1.3966480446927374, "grad_norm": 11.611478895373097, "learning_rate": 8.710801393728223e-07, "logits/chosen": -3.7455551624298096, "logits/rejected": -3.686239719390869, "logps/chosen": -11.092876434326172, "logps/rejected": -62.6893310546875, "loss": 0.2938, "rewards/accuracies": 1.0, "rewards/chosen": 0.8907960653305054, "rewards/margins": 0.8907960653305054, "rewards/rejected": 0.0, "step": 250 }, { "epoch": 1.4022346368715084, "grad_norm": 12.031609061258749, "learning_rate": 8.745644599303135e-07, "logits/chosen": -3.03202486038208, "logits/rejected": -3.108268976211548, "logps/chosen": -51.005760192871094, "logps/rejected": -26.438575744628906, "loss": 0.3098, "rewards/accuracies": 0.75, "rewards/chosen": 0.7945677042007446, "rewards/margins": 0.7945677042007446, "rewards/rejected": 0.0, "step": 251 }, { "epoch": 1.4078212290502794, "grad_norm": 8.882604544624783, "learning_rate": 8.780487804878048e-07, "logits/chosen": -3.4373087882995605, "logits/rejected": -3.1588633060455322, "logps/chosen": -30.948307037353516, "logps/rejected": -26.18224334716797, "loss": 0.2956, "rewards/accuracies": 1.0, "rewards/chosen": 1.5699577331542969, "rewards/margins": 1.5699577331542969, "rewards/rejected": 0.0, "step": 252 }, { "epoch": 1.4134078212290504, "grad_norm": 11.334729057967122, "learning_rate": 8.815331010452961e-07, "logits/chosen": -3.963742256164551, "logits/rejected": -3.785430908203125, "logps/chosen": -18.78699493408203, "logps/rejected": -36.91880798339844, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 1.2210891246795654, "rewards/margins": 1.2210891246795654, "rewards/rejected": 0.0, "step": 253 }, { "epoch": 1.4189944134078212, "grad_norm": 12.317771739780161, "learning_rate": 8.850174216027874e-07, "logits/chosen": -3.338820219039917, "logits/rejected": -3.345203399658203, "logps/chosen": -11.049318313598633, "logps/rejected": -27.54586410522461, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": 1.1689317226409912, "rewards/margins": 1.1689317226409912, "rewards/rejected": 0.0, "step": 254 }, { "epoch": 1.4245810055865922, "grad_norm": 12.878542430550699, "learning_rate": 8.885017421602788e-07, "logits/chosen": -3.6580207347869873, "logits/rejected": -3.5710203647613525, "logps/chosen": -15.82120418548584, "logps/rejected": -26.688724517822266, "loss": 0.3185, "rewards/accuracies": 1.0, "rewards/chosen": 1.1673306226730347, "rewards/margins": 1.1673306226730347, "rewards/rejected": 0.0, "step": 255 }, { "epoch": 1.4301675977653632, "grad_norm": 13.135079685475205, "learning_rate": 8.9198606271777e-07, "logits/chosen": -3.5650031566619873, "logits/rejected": -3.6054866313934326, "logps/chosen": -10.30579662322998, "logps/rejected": -19.772520065307617, "loss": 0.3444, "rewards/accuracies": 1.0, "rewards/chosen": 0.7571910619735718, "rewards/margins": 0.7571910619735718, "rewards/rejected": 0.0, "step": 256 }, { "epoch": 1.435754189944134, "grad_norm": 13.055299569071433, "learning_rate": 8.954703832752612e-07, "logits/chosen": -3.363001823425293, "logits/rejected": -3.2151830196380615, "logps/chosen": -21.344371795654297, "logps/rejected": -34.520687103271484, "loss": 0.3164, "rewards/accuracies": 1.0, "rewards/chosen": 1.5332088470458984, "rewards/margins": 1.5332088470458984, "rewards/rejected": 0.0, "step": 257 }, { "epoch": 1.441340782122905, "grad_norm": 13.31330440079475, "learning_rate": 8.989547038327526e-07, "logits/chosen": -3.4222359657287598, "logits/rejected": -3.3897311687469482, "logps/chosen": -26.0482120513916, "logps/rejected": -31.58816146850586, "loss": 0.3375, "rewards/accuracies": 1.0, "rewards/chosen": 1.5745426416397095, "rewards/margins": 1.5745426416397095, "rewards/rejected": 0.0, "step": 258 }, { "epoch": 1.446927374301676, "grad_norm": 9.644841054252685, "learning_rate": 9.024390243902439e-07, "logits/chosen": -3.189476728439331, "logits/rejected": -3.0403568744659424, "logps/chosen": -17.522367477416992, "logps/rejected": -40.978904724121094, "loss": 0.2895, "rewards/accuracies": 1.0, "rewards/chosen": 0.9727367162704468, "rewards/margins": 0.9727367162704468, "rewards/rejected": 0.0, "step": 259 }, { "epoch": 1.452513966480447, "grad_norm": 9.595550125483722, "learning_rate": 9.059233449477352e-07, "logits/chosen": -3.685537815093994, "logits/rejected": -3.7384848594665527, "logps/chosen": -11.41524887084961, "logps/rejected": -23.859867095947266, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 0.869933545589447, "rewards/margins": 0.869933545589447, "rewards/rejected": 0.0, "step": 260 }, { "epoch": 1.452513966480447, "eval_logits/chosen": -3.3066470623016357, "eval_logits/rejected": -3.338177442550659, "eval_logps/chosen": -19.498775482177734, "eval_logps/rejected": -31.355274200439453, "eval_loss": 0.34678158164024353, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.057126522064209, "eval_rewards/margins": 1.057126522064209, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6534, "eval_samples_per_second": 9.212, "eval_steps_per_second": 0.297, "step": 260 }, { "epoch": 1.458100558659218, "grad_norm": 12.575604168259234, "learning_rate": 9.094076655052264e-07, "logits/chosen": -3.4521327018737793, "logits/rejected": -3.5857722759246826, "logps/chosen": -23.312461853027344, "logps/rejected": -23.504331588745117, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/chosen": 1.1274609565734863, "rewards/margins": 1.1274609565734863, "rewards/rejected": 0.0, "step": 261 }, { "epoch": 1.463687150837989, "grad_norm": 13.736170560933992, "learning_rate": 9.128919860627177e-07, "logits/chosen": -3.2900683879852295, "logits/rejected": -3.16727614402771, "logps/chosen": -33.037010192871094, "logps/rejected": -41.841793060302734, "loss": 0.346, "rewards/accuracies": 1.0, "rewards/chosen": 1.271679401397705, "rewards/margins": 1.271679401397705, "rewards/rejected": 0.0, "step": 262 }, { "epoch": 1.4692737430167597, "grad_norm": 10.469962806407239, "learning_rate": 9.16376306620209e-07, "logits/chosen": -3.060082197189331, "logits/rejected": -3.1097497940063477, "logps/chosen": -14.170034408569336, "logps/rejected": -15.353635787963867, "loss": 0.3308, "rewards/accuracies": 1.0, "rewards/chosen": 0.7865402698516846, "rewards/margins": 0.7865402698516846, "rewards/rejected": 0.0, "step": 263 }, { "epoch": 1.4748603351955307, "grad_norm": 11.331562925216264, "learning_rate": 9.198606271777003e-07, "logits/chosen": -3.761474370956421, "logits/rejected": -3.6347577571868896, "logps/chosen": -20.385032653808594, "logps/rejected": -31.475448608398438, "loss": 0.2778, "rewards/accuracies": 1.0, "rewards/chosen": 1.1574748754501343, "rewards/margins": 1.1574748754501343, "rewards/rejected": 0.0, "step": 264 }, { "epoch": 1.4804469273743017, "grad_norm": 13.562600056540433, "learning_rate": 9.233449477351916e-07, "logits/chosen": -3.46614408493042, "logits/rejected": -3.1842451095581055, "logps/chosen": -15.610736846923828, "logps/rejected": -65.3194351196289, "loss": 0.3205, "rewards/accuracies": 1.0, "rewards/chosen": 1.061760663986206, "rewards/margins": 1.061760663986206, "rewards/rejected": 0.0, "step": 265 }, { "epoch": 1.4860335195530725, "grad_norm": 10.88847151339513, "learning_rate": 9.26829268292683e-07, "logits/chosen": -3.5326571464538574, "logits/rejected": -3.43210506439209, "logps/chosen": -17.096965789794922, "logps/rejected": -25.843481063842773, "loss": 0.3111, "rewards/accuracies": 1.0, "rewards/chosen": 0.9644085168838501, "rewards/margins": 0.9644085168838501, "rewards/rejected": 0.0, "step": 266 }, { "epoch": 1.4916201117318435, "grad_norm": 9.930526188424318, "learning_rate": 9.303135888501742e-07, "logits/chosen": -3.2434909343719482, "logits/rejected": -3.5659306049346924, "logps/chosen": -36.433223724365234, "logps/rejected": -46.459388732910156, "loss": 0.292, "rewards/accuracies": 1.0, "rewards/chosen": 1.1708192825317383, "rewards/margins": 1.1708192825317383, "rewards/rejected": 0.0, "step": 267 }, { "epoch": 1.4972067039106145, "grad_norm": 12.444217260653138, "learning_rate": 9.337979094076654e-07, "logits/chosen": -3.221174955368042, "logits/rejected": -2.937802314758301, "logps/chosen": -33.17811584472656, "logps/rejected": -35.00051498413086, "loss": 0.3233, "rewards/accuracies": 1.0, "rewards/chosen": 0.7740553021430969, "rewards/margins": 0.7740553021430969, "rewards/rejected": 0.0, "step": 268 }, { "epoch": 1.5027932960893855, "grad_norm": 10.716801635731775, "learning_rate": 9.372822299651567e-07, "logits/chosen": -3.3607850074768066, "logits/rejected": -3.2163217067718506, "logps/chosen": -16.583881378173828, "logps/rejected": -48.29698944091797, "loss": 0.3303, "rewards/accuracies": 1.0, "rewards/chosen": 1.3584688901901245, "rewards/margins": 1.3584688901901245, "rewards/rejected": 0.0, "step": 269 }, { "epoch": 1.5083798882681565, "grad_norm": 15.041626311606112, "learning_rate": 9.407665505226481e-07, "logits/chosen": -3.3310461044311523, "logits/rejected": -3.4619643688201904, "logps/chosen": -38.412445068359375, "logps/rejected": -26.61621856689453, "loss": 0.3557, "rewards/accuracies": 0.75, "rewards/chosen": 0.6411152482032776, "rewards/margins": 0.6411152482032776, "rewards/rejected": 0.0, "step": 270 }, { "epoch": 1.5139664804469275, "grad_norm": 10.425063879304092, "learning_rate": 9.442508710801394e-07, "logits/chosen": -3.748817205429077, "logits/rejected": -3.6500179767608643, "logps/chosen": -21.386062622070312, "logps/rejected": -22.98751449584961, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 1.0100945234298706, "rewards/margins": 1.0100945234298706, "rewards/rejected": 0.0, "step": 271 }, { "epoch": 1.5195530726256983, "grad_norm": 12.140949409242403, "learning_rate": 9.477351916376306e-07, "logits/chosen": -3.5599870681762695, "logits/rejected": -2.9221935272216797, "logps/chosen": -35.05908203125, "logps/rejected": -48.583229064941406, "loss": 0.3589, "rewards/accuracies": 1.0, "rewards/chosen": 0.5782874226570129, "rewards/margins": 0.5782874226570129, "rewards/rejected": 0.0, "step": 272 }, { "epoch": 1.5251396648044693, "grad_norm": 16.849180066616874, "learning_rate": 9.512195121951218e-07, "logits/chosen": -3.553140163421631, "logits/rejected": -3.5460431575775146, "logps/chosen": -17.00033950805664, "logps/rejected": -20.65375518798828, "loss": 0.3222, "rewards/accuracies": 1.0, "rewards/chosen": 1.0098237991333008, "rewards/margins": 1.0098237991333008, "rewards/rejected": 0.0, "step": 273 }, { "epoch": 1.5307262569832403, "grad_norm": 10.37525339082753, "learning_rate": 9.547038327526132e-07, "logits/chosen": -3.491828441619873, "logits/rejected": -2.971925973892212, "logps/chosen": -19.172990798950195, "logps/rejected": -57.93187713623047, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 1.0543417930603027, "rewards/margins": 1.0543417930603027, "rewards/rejected": 0.0, "step": 274 }, { "epoch": 1.536312849162011, "grad_norm": 15.071930004216172, "learning_rate": 9.581881533101046e-07, "logits/chosen": -3.063854932785034, "logits/rejected": -3.2093632221221924, "logps/chosen": -36.64848327636719, "logps/rejected": -28.312206268310547, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 1.0443449020385742, "rewards/margins": 1.0443449020385742, "rewards/rejected": 0.0, "step": 275 }, { "epoch": 1.541899441340782, "grad_norm": 12.085634495974382, "learning_rate": 9.616724738675958e-07, "logits/chosen": -3.3405849933624268, "logits/rejected": -3.4201269149780273, "logps/chosen": -27.971378326416016, "logps/rejected": -28.627212524414062, "loss": 0.3322, "rewards/accuracies": 1.0, "rewards/chosen": 1.19189453125, "rewards/margins": 1.19189453125, "rewards/rejected": 0.0, "step": 276 }, { "epoch": 1.547486033519553, "grad_norm": 10.207854538997713, "learning_rate": 9.65156794425087e-07, "logits/chosen": -3.3752753734588623, "logits/rejected": -3.546302556991577, "logps/chosen": -38.091705322265625, "logps/rejected": -34.98358917236328, "loss": 0.3002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0685861110687256, "rewards/margins": 2.0685861110687256, "rewards/rejected": 0.0, "step": 277 }, { "epoch": 1.553072625698324, "grad_norm": 11.616172239405316, "learning_rate": 9.686411149825783e-07, "logits/chosen": -3.7080671787261963, "logits/rejected": -3.5104024410247803, "logps/chosen": -15.084697723388672, "logps/rejected": -25.442108154296875, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 1.0713567733764648, "rewards/margins": 1.0713567733764648, "rewards/rejected": 0.0, "step": 278 }, { "epoch": 1.558659217877095, "grad_norm": 13.22103411673907, "learning_rate": 9.721254355400697e-07, "logits/chosen": -3.4512689113616943, "logits/rejected": -3.094844341278076, "logps/chosen": -26.031299591064453, "logps/rejected": -23.88300323486328, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 1.0946345329284668, "rewards/margins": 1.0946345329284668, "rewards/rejected": 0.0, "step": 279 }, { "epoch": 1.564245810055866, "grad_norm": 10.027545681733605, "learning_rate": 9.756097560975609e-07, "logits/chosen": -3.328197479248047, "logits/rejected": -3.5146186351776123, "logps/chosen": -39.08555603027344, "logps/rejected": -29.00870132446289, "loss": 0.3004, "rewards/accuracies": 1.0, "rewards/chosen": 0.8183404207229614, "rewards/margins": 0.8183404207229614, "rewards/rejected": 0.0, "step": 280 }, { "epoch": 1.564245810055866, "eval_logits/chosen": -3.3012683391571045, "eval_logits/rejected": -3.3296456336975098, "eval_logps/chosen": -19.454092025756836, "eval_logps/rejected": -31.271724700927734, "eval_loss": 0.34393545985221863, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0615949630737305, "eval_rewards/margins": 1.0615949630737305, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6036, "eval_samples_per_second": 9.225, "eval_steps_per_second": 0.298, "step": 280 }, { "epoch": 1.5698324022346368, "grad_norm": 11.965310318453831, "learning_rate": 9.790940766550523e-07, "logits/chosen": -3.3429605960845947, "logits/rejected": -3.2995493412017822, "logps/chosen": -22.374677658081055, "logps/rejected": -16.40281105041504, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 1.042400598526001, "rewards/margins": 1.042400598526001, "rewards/rejected": 0.0, "step": 281 }, { "epoch": 1.5754189944134078, "grad_norm": 10.352211488442286, "learning_rate": 9.825783972125434e-07, "logits/chosen": -3.5352978706359863, "logits/rejected": -3.5306782722473145, "logps/chosen": -12.284505844116211, "logps/rejected": -26.704856872558594, "loss": 0.3093, "rewards/accuracies": 1.0, "rewards/chosen": 0.870249330997467, "rewards/margins": 0.870249330997467, "rewards/rejected": 0.0, "step": 282 }, { "epoch": 1.5810055865921788, "grad_norm": 12.032630843979712, "learning_rate": 9.860627177700348e-07, "logits/chosen": -3.429222583770752, "logits/rejected": -3.3236958980560303, "logps/chosen": -27.717411041259766, "logps/rejected": -25.07077980041504, "loss": 0.3017, "rewards/accuracies": 1.0, "rewards/chosen": 1.2042782306671143, "rewards/margins": 1.2042782306671143, "rewards/rejected": 0.0, "step": 283 }, { "epoch": 1.5865921787709496, "grad_norm": 9.779522151659501, "learning_rate": 9.89547038327526e-07, "logits/chosen": -3.578108310699463, "logits/rejected": -3.4658658504486084, "logps/chosen": -13.309062004089355, "logps/rejected": -26.286941528320312, "loss": 0.3008, "rewards/accuracies": 1.0, "rewards/chosen": 0.9821389317512512, "rewards/margins": 0.9821389317512512, "rewards/rejected": 0.0, "step": 284 }, { "epoch": 1.5921787709497206, "grad_norm": 10.263148728342822, "learning_rate": 9.930313588850174e-07, "logits/chosen": -3.5624876022338867, "logits/rejected": -3.529696464538574, "logps/chosen": -19.440580368041992, "logps/rejected": -21.190061569213867, "loss": 0.2961, "rewards/accuracies": 0.75, "rewards/chosen": 0.6123175621032715, "rewards/margins": 0.6123175621032715, "rewards/rejected": 0.0, "step": 285 }, { "epoch": 1.5977653631284916, "grad_norm": 10.342644978214786, "learning_rate": 9.965156794425088e-07, "logits/chosen": -3.0572822093963623, "logits/rejected": -3.336202383041382, "logps/chosen": -17.684846878051758, "logps/rejected": -34.97415542602539, "loss": 0.2706, "rewards/accuracies": 1.0, "rewards/chosen": 1.2538070678710938, "rewards/margins": 1.2538070678710938, "rewards/rejected": 0.0, "step": 286 }, { "epoch": 1.6033519553072626, "grad_norm": 17.455815237252825, "learning_rate": 1e-06, "logits/chosen": -3.422065496444702, "logits/rejected": -3.199143409729004, "logps/chosen": -12.241714477539062, "logps/rejected": -20.09499740600586, "loss": 0.3422, "rewards/accuracies": 1.0, "rewards/chosen": 0.9664725065231323, "rewards/margins": 0.9664725065231323, "rewards/rejected": 0.0, "step": 287 }, { "epoch": 1.6089385474860336, "grad_norm": 10.920072184573593, "learning_rate": 9.999996284554774e-07, "logits/chosen": -3.2156920433044434, "logits/rejected": -3.287858009338379, "logps/chosen": -14.97732162475586, "logps/rejected": -24.15703773498535, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 1.1677476167678833, "rewards/margins": 1.1677476167678833, "rewards/rejected": 0.0, "step": 288 }, { "epoch": 1.6145251396648046, "grad_norm": 10.17580058388229, "learning_rate": 9.999985138224618e-07, "logits/chosen": -3.4886186122894287, "logits/rejected": -3.316105604171753, "logps/chosen": -13.622641563415527, "logps/rejected": -46.91883087158203, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123961925506592, "rewards/margins": 0.9123961925506592, "rewards/rejected": 0.0, "step": 289 }, { "epoch": 1.6201117318435754, "grad_norm": 9.326145815880142, "learning_rate": 9.999966561026099e-07, "logits/chosen": -3.2829484939575195, "logits/rejected": -3.075709104537964, "logps/chosen": -10.445840835571289, "logps/rejected": -40.42874526977539, "loss": 0.3039, "rewards/accuracies": 1.0, "rewards/chosen": 0.8486101627349854, "rewards/margins": 0.8486101627349854, "rewards/rejected": 0.0, "step": 290 }, { "epoch": 1.6256983240223464, "grad_norm": 9.776873356124641, "learning_rate": 9.999940552986825e-07, "logits/chosen": -3.469417095184326, "logits/rejected": -3.279420852661133, "logps/chosen": -17.53714370727539, "logps/rejected": -23.5706787109375, "loss": 0.2718, "rewards/accuracies": 1.0, "rewards/chosen": 1.2589737176895142, "rewards/margins": 1.2589737176895142, "rewards/rejected": 0.0, "step": 291 }, { "epoch": 1.6312849162011172, "grad_norm": 10.429075276268662, "learning_rate": 9.99990711414545e-07, "logits/chosen": -3.7084476947784424, "logits/rejected": -3.5754408836364746, "logps/chosen": -24.693119049072266, "logps/rejected": -33.41971969604492, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 1.529041051864624, "rewards/margins": 1.529041051864624, "rewards/rejected": 0.0, "step": 292 }, { "epoch": 1.6368715083798882, "grad_norm": 9.720924585440683, "learning_rate": 9.999866244551666e-07, "logits/chosen": -3.4124324321746826, "logits/rejected": -3.210049867630005, "logps/chosen": -11.158379554748535, "logps/rejected": -43.5897331237793, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147404670715332, "rewards/margins": 1.0147404670715332, "rewards/rejected": 0.0, "step": 293 }, { "epoch": 1.6424581005586592, "grad_norm": 9.944702206569138, "learning_rate": 9.999817944266215e-07, "logits/chosen": -3.3606770038604736, "logits/rejected": -3.4656686782836914, "logps/chosen": -6.251188278198242, "logps/rejected": -21.617168426513672, "loss": 0.3217, "rewards/accuracies": 1.0, "rewards/chosen": 0.5969514846801758, "rewards/margins": 0.5969514846801758, "rewards/rejected": 0.0, "step": 294 }, { "epoch": 1.6480446927374302, "grad_norm": 10.385353254236556, "learning_rate": 9.999762213360883e-07, "logits/chosen": -3.466520071029663, "logits/rejected": -3.5797107219696045, "logps/chosen": -25.80699920654297, "logps/rejected": -19.83127212524414, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 1.3000495433807373, "rewards/margins": 1.3000495433807373, "rewards/rejected": 0.0, "step": 295 }, { "epoch": 1.6536312849162011, "grad_norm": 8.773867276405428, "learning_rate": 9.99969905191849e-07, "logits/chosen": -3.557112693786621, "logits/rejected": -3.573246955871582, "logps/chosen": -20.78133773803711, "logps/rejected": -34.07285690307617, "loss": 0.2889, "rewards/accuracies": 1.0, "rewards/chosen": 1.1277602910995483, "rewards/margins": 1.1277602910995483, "rewards/rejected": 0.0, "step": 296 }, { "epoch": 1.6592178770949721, "grad_norm": 9.821752375679727, "learning_rate": 9.99962846003291e-07, "logits/chosen": -3.7407591342926025, "logits/rejected": -3.767958879470825, "logps/chosen": -14.965896606445312, "logps/rejected": -20.833538055419922, "loss": 0.2999, "rewards/accuracies": 1.0, "rewards/chosen": 0.9994887113571167, "rewards/margins": 0.9994887113571167, "rewards/rejected": 0.0, "step": 297 }, { "epoch": 1.6648044692737431, "grad_norm": 10.928183640862303, "learning_rate": 9.999550437809053e-07, "logits/chosen": -3.2713358402252197, "logits/rejected": -3.4095914363861084, "logps/chosen": -27.807035446166992, "logps/rejected": -38.68109130859375, "loss": 0.2967, "rewards/accuracies": 1.0, "rewards/chosen": 1.2415056228637695, "rewards/margins": 1.2415056228637695, "rewards/rejected": 0.0, "step": 298 }, { "epoch": 1.670391061452514, "grad_norm": 17.270546512795296, "learning_rate": 9.999464985362873e-07, "logits/chosen": -3.3679652214050293, "logits/rejected": -3.5425937175750732, "logps/chosen": -61.781837463378906, "logps/rejected": -28.911365509033203, "loss": 0.3695, "rewards/accuracies": 0.5, "rewards/chosen": 0.8019064664840698, "rewards/margins": 0.8019064664840698, "rewards/rejected": 0.0, "step": 299 }, { "epoch": 1.675977653631285, "grad_norm": 14.185589116130966, "learning_rate": 9.999372102821368e-07, "logits/chosen": -3.5178616046905518, "logits/rejected": -3.4500045776367188, "logps/chosen": -11.943248748779297, "logps/rejected": -31.703353881835938, "loss": 0.3579, "rewards/accuracies": 1.0, "rewards/chosen": 1.0013020038604736, "rewards/margins": 1.0013020038604736, "rewards/rejected": 0.0, "step": 300 }, { "epoch": 1.675977653631285, "eval_logits/chosen": -3.322420597076416, "eval_logits/rejected": -3.361907958984375, "eval_logps/chosen": -19.29962158203125, "eval_logps/rejected": -31.117549896240234, "eval_loss": 0.344208300113678, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0770422220230103, "eval_rewards/margins": 1.0770422220230103, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6372, "eval_samples_per_second": 9.216, "eval_steps_per_second": 0.297, "step": 300 }, { "epoch": 1.6815642458100557, "grad_norm": 9.939112746890036, "learning_rate": 9.999271790322582e-07, "logits/chosen": -3.5513951778411865, "logits/rejected": -3.587817668914795, "logps/chosen": -10.393442153930664, "logps/rejected": -32.21281814575195, "loss": 0.3038, "rewards/accuracies": 1.0, "rewards/chosen": 0.921258807182312, "rewards/margins": 0.921258807182312, "rewards/rejected": 0.0, "step": 301 }, { "epoch": 1.6871508379888267, "grad_norm": 15.239563770884459, "learning_rate": 9.99916404801559e-07, "logits/chosen": -3.6404824256896973, "logits/rejected": -3.5286920070648193, "logps/chosen": -32.23038864135742, "logps/rejected": -39.896217346191406, "loss": 0.3328, "rewards/accuracies": 1.0, "rewards/chosen": 0.8375921249389648, "rewards/margins": 0.8375921249389648, "rewards/rejected": 0.0, "step": 302 }, { "epoch": 1.6927374301675977, "grad_norm": 10.592627695108975, "learning_rate": 9.999048876060525e-07, "logits/chosen": -3.9261133670806885, "logits/rejected": -3.867961883544922, "logps/chosen": -10.186397552490234, "logps/rejected": -26.875429153442383, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 1.019852638244629, "rewards/margins": 1.019852638244629, "rewards/rejected": 0.0, "step": 303 }, { "epoch": 1.6983240223463687, "grad_norm": 11.76517910635558, "learning_rate": 9.998926274628546e-07, "logits/chosen": -3.2354116439819336, "logits/rejected": -3.0572993755340576, "logps/chosen": -36.67839050292969, "logps/rejected": -22.477191925048828, "loss": 0.2708, "rewards/accuracies": 0.75, "rewards/chosen": 0.6770723462104797, "rewards/margins": 0.6770723462104797, "rewards/rejected": 0.0, "step": 304 }, { "epoch": 1.7039106145251397, "grad_norm": 14.159098515470435, "learning_rate": 9.99879624390186e-07, "logits/chosen": -3.5911238193511963, "logits/rejected": -3.755124092102051, "logps/chosen": -21.760345458984375, "logps/rejected": -15.202546119689941, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": 1.0169107913970947, "rewards/margins": 1.0169107913970947, "rewards/rejected": 0.0, "step": 305 }, { "epoch": 1.7094972067039107, "grad_norm": 9.736802850531827, "learning_rate": 9.998658784073723e-07, "logits/chosen": -3.1625123023986816, "logits/rejected": -3.1821582317352295, "logps/chosen": -21.424030303955078, "logps/rejected": -20.46610450744629, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 1.1583060026168823, "rewards/margins": 1.1583060026168823, "rewards/rejected": 0.0, "step": 306 }, { "epoch": 1.7150837988826817, "grad_norm": 11.009203336538388, "learning_rate": 9.998513895348417e-07, "logits/chosen": -3.2195072174072266, "logits/rejected": -3.4642629623413086, "logps/chosen": -17.911853790283203, "logps/rejected": -15.646801948547363, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 1.1663609743118286, "rewards/margins": 1.1663609743118286, "rewards/rejected": 0.0, "step": 307 }, { "epoch": 1.7206703910614525, "grad_norm": 10.53450471248475, "learning_rate": 9.998361577941278e-07, "logits/chosen": -3.488490104675293, "logits/rejected": -3.322944164276123, "logps/chosen": -16.006378173828125, "logps/rejected": -28.02416229248047, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.024048089981079, "rewards/margins": 1.024048089981079, "rewards/rejected": 0.0, "step": 308 }, { "epoch": 1.7262569832402235, "grad_norm": 12.582435776032845, "learning_rate": 9.998201832078674e-07, "logits/chosen": -3.3339123725891113, "logits/rejected": -3.316779851913452, "logps/chosen": -28.325908660888672, "logps/rejected": -16.219600677490234, "loss": 0.3268, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893666505813599, "rewards/margins": 0.9893666505813599, "rewards/rejected": 0.0, "step": 309 }, { "epoch": 1.7318435754189943, "grad_norm": 13.41209684100597, "learning_rate": 9.998034657998019e-07, "logits/chosen": -3.640216588973999, "logits/rejected": -3.4110727310180664, "logps/chosen": -11.483031272888184, "logps/rejected": -40.59260559082031, "loss": 0.3189, "rewards/accuracies": 1.0, "rewards/chosen": 1.0720748901367188, "rewards/margins": 1.0720748901367188, "rewards/rejected": 0.0, "step": 310 }, { "epoch": 1.7374301675977653, "grad_norm": 15.205084198489166, "learning_rate": 9.997860055947756e-07, "logits/chosen": -3.6733248233795166, "logits/rejected": -3.6724252700805664, "logps/chosen": -7.7731428146362305, "logps/rejected": -32.35521697998047, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 0.5072684288024902, "rewards/margins": 0.5072684288024902, "rewards/rejected": 0.0, "step": 311 }, { "epoch": 1.7430167597765363, "grad_norm": 11.414239251919467, "learning_rate": 9.997678026187383e-07, "logits/chosen": -3.6234049797058105, "logits/rejected": -3.676602363586426, "logps/chosen": -12.703027725219727, "logps/rejected": -30.089244842529297, "loss": 0.3159, "rewards/accuracies": 1.0, "rewards/chosen": 0.7911865711212158, "rewards/margins": 0.7911865711212158, "rewards/rejected": 0.0, "step": 312 }, { "epoch": 1.7486033519553073, "grad_norm": 12.823829809057761, "learning_rate": 9.997488568987425e-07, "logits/chosen": -3.5169265270233154, "logits/rejected": -3.595710039138794, "logps/chosen": -11.737189292907715, "logps/rejected": -31.50156021118164, "loss": 0.3476, "rewards/accuracies": 1.0, "rewards/chosen": 0.8435858488082886, "rewards/margins": 0.8435858488082886, "rewards/rejected": 0.0, "step": 313 }, { "epoch": 1.7541899441340782, "grad_norm": 10.401381588926625, "learning_rate": 9.997291684629449e-07, "logits/chosen": -3.3408241271972656, "logits/rejected": -3.465125560760498, "logps/chosen": -18.41136932373047, "logps/rejected": -20.647062301635742, "loss": 0.3035, "rewards/accuracies": 1.0, "rewards/chosen": 1.3910760879516602, "rewards/margins": 1.3910760879516602, "rewards/rejected": 0.0, "step": 314 }, { "epoch": 1.7597765363128492, "grad_norm": 8.50936106344969, "learning_rate": 9.99708737340606e-07, "logits/chosen": -3.2572875022888184, "logits/rejected": -3.19663143157959, "logps/chosen": -24.815534591674805, "logps/rejected": -24.796138763427734, "loss": 0.2827, "rewards/accuracies": 1.0, "rewards/chosen": 1.203381061553955, "rewards/margins": 1.203381061553955, "rewards/rejected": 0.0, "step": 315 }, { "epoch": 1.7653631284916202, "grad_norm": 15.15456006027298, "learning_rate": 9.996875635620902e-07, "logits/chosen": -3.3994522094726562, "logits/rejected": -3.299717903137207, "logps/chosen": -14.031728744506836, "logps/rejected": -47.74146270751953, "loss": 0.3482, "rewards/accuracies": 1.0, "rewards/chosen": 1.1319087743759155, "rewards/margins": 1.1319087743759155, "rewards/rejected": 0.0, "step": 316 }, { "epoch": 1.770949720670391, "grad_norm": 13.875581331692954, "learning_rate": 9.996656471588656e-07, "logits/chosen": -3.031449556350708, "logits/rejected": -3.1041312217712402, "logps/chosen": -19.398300170898438, "logps/rejected": -33.0574951171875, "loss": 0.3268, "rewards/accuracies": 1.0, "rewards/chosen": 1.2168288230895996, "rewards/margins": 1.2168288230895996, "rewards/rejected": 0.0, "step": 317 }, { "epoch": 1.776536312849162, "grad_norm": 13.74240813346542, "learning_rate": 9.996429881635036e-07, "logits/chosen": -3.656426191329956, "logits/rejected": -3.317042112350464, "logps/chosen": -10.931537628173828, "logps/rejected": -45.407440185546875, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 0.8999665975570679, "rewards/margins": 0.8999665975570679, "rewards/rejected": 0.0, "step": 318 }, { "epoch": 1.7821229050279328, "grad_norm": 16.88114307609148, "learning_rate": 9.996195866096797e-07, "logits/chosen": -3.3901474475860596, "logits/rejected": -3.2347769737243652, "logps/chosen": -20.043804168701172, "logps/rejected": -23.260557174682617, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 1.1257836818695068, "rewards/margins": 1.1257836818695068, "rewards/rejected": 0.0, "step": 319 }, { "epoch": 1.7877094972067038, "grad_norm": 19.952904936930338, "learning_rate": 9.995954425321727e-07, "logits/chosen": -3.3588874340057373, "logits/rejected": -3.2260642051696777, "logps/chosen": -66.18074798583984, "logps/rejected": -35.820491790771484, "loss": 0.3457, "rewards/accuracies": 0.75, "rewards/chosen": 1.2065391540527344, "rewards/margins": 1.2065391540527344, "rewards/rejected": 0.0, "step": 320 }, { "epoch": 1.7877094972067038, "eval_logits/chosen": -3.3043503761291504, "eval_logits/rejected": -3.3413078784942627, "eval_logps/chosen": -19.3726749420166, "eval_logps/rejected": -31.22524642944336, "eval_loss": 0.34021639823913574, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0697367191314697, "eval_rewards/margins": 1.0697367191314697, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6357, "eval_samples_per_second": 9.216, "eval_steps_per_second": 0.297, "step": 320 }, { "epoch": 1.7932960893854748, "grad_norm": 9.301428863056444, "learning_rate": 9.99570555966865e-07, "logits/chosen": -3.5000479221343994, "logits/rejected": -3.395080089569092, "logps/chosen": -30.758852005004883, "logps/rejected": -30.442750930786133, "loss": 0.2747, "rewards/accuracies": 1.0, "rewards/chosen": 1.9963459968566895, "rewards/margins": 1.9963459968566895, "rewards/rejected": 0.0, "step": 321 }, { "epoch": 1.7988826815642458, "grad_norm": 9.171701997345275, "learning_rate": 9.995449269507422e-07, "logits/chosen": -3.5941829681396484, "logits/rejected": -3.524878978729248, "logps/chosen": -9.524094581604004, "logps/rejected": -32.728294372558594, "loss": 0.2647, "rewards/accuracies": 1.0, "rewards/chosen": 0.7229211330413818, "rewards/margins": 0.7229211330413818, "rewards/rejected": 0.0, "step": 322 }, { "epoch": 1.8044692737430168, "grad_norm": 10.69476295700697, "learning_rate": 9.995185555218943e-07, "logits/chosen": -3.359325408935547, "logits/rejected": -3.411824941635132, "logps/chosen": -14.76585578918457, "logps/rejected": -23.467166900634766, "loss": 0.3463, "rewards/accuracies": 0.75, "rewards/chosen": 0.5161473155021667, "rewards/margins": 0.5161473155021667, "rewards/rejected": 0.0, "step": 323 }, { "epoch": 1.8100558659217878, "grad_norm": 11.118347896847748, "learning_rate": 9.994914417195133e-07, "logits/chosen": -3.2842142581939697, "logits/rejected": -3.1278810501098633, "logps/chosen": -9.60197639465332, "logps/rejected": -22.244657516479492, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 0.7514290809631348, "rewards/margins": 0.7514290809631348, "rewards/rejected": 0.0, "step": 324 }, { "epoch": 1.8156424581005588, "grad_norm": 13.029770617319564, "learning_rate": 9.994635855838955e-07, "logits/chosen": -3.752204179763794, "logits/rejected": -3.814516305923462, "logps/chosen": -18.200443267822266, "logps/rejected": -22.95499038696289, "loss": 0.3628, "rewards/accuracies": 1.0, "rewards/chosen": 0.9537428617477417, "rewards/margins": 0.9537428617477417, "rewards/rejected": 0.0, "step": 325 }, { "epoch": 1.8212290502793296, "grad_norm": 12.513334642173808, "learning_rate": 9.994349871564399e-07, "logits/chosen": -3.5632996559143066, "logits/rejected": -3.520127296447754, "logps/chosen": -26.24734115600586, "logps/rejected": -28.493831634521484, "loss": 0.2948, "rewards/accuracies": 1.0, "rewards/chosen": 1.1466038227081299, "rewards/margins": 1.1466038227081299, "rewards/rejected": 0.0, "step": 326 }, { "epoch": 1.8268156424581006, "grad_norm": 9.798945596568291, "learning_rate": 9.994056464796488e-07, "logits/chosen": -2.756512403488159, "logits/rejected": -2.7556052207946777, "logps/chosen": -42.9817008972168, "logps/rejected": -46.558284759521484, "loss": 0.3303, "rewards/accuracies": 1.0, "rewards/chosen": 0.8298018574714661, "rewards/margins": 0.8298018574714661, "rewards/rejected": 0.0, "step": 327 }, { "epoch": 1.8324022346368714, "grad_norm": 10.918392572339316, "learning_rate": 9.993755635971276e-07, "logits/chosen": -3.643589496612549, "logits/rejected": -3.6738765239715576, "logps/chosen": -17.016036987304688, "logps/rejected": -27.112552642822266, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": 1.4312465190887451, "rewards/margins": 1.4312465190887451, "rewards/rejected": 0.0, "step": 328 }, { "epoch": 1.8379888268156424, "grad_norm": 13.706813583359736, "learning_rate": 9.993447385535851e-07, "logits/chosen": -2.72222638130188, "logits/rejected": -2.739816427230835, "logps/chosen": -28.710716247558594, "logps/rejected": -27.147722244262695, "loss": 0.3485, "rewards/accuracies": 1.0, "rewards/chosen": 0.7921134233474731, "rewards/margins": 0.7921134233474731, "rewards/rejected": 0.0, "step": 329 }, { "epoch": 1.8435754189944134, "grad_norm": 13.114944075004717, "learning_rate": 9.99313171394833e-07, "logits/chosen": -3.390263795852661, "logits/rejected": -3.270251750946045, "logps/chosen": -27.65669059753418, "logps/rejected": -27.90283203125, "loss": 0.3475, "rewards/accuracies": 1.0, "rewards/chosen": 1.8979809284210205, "rewards/margins": 1.8979809284210205, "rewards/rejected": 0.0, "step": 330 }, { "epoch": 1.8491620111731844, "grad_norm": 9.133700091484517, "learning_rate": 9.992808621677849e-07, "logits/chosen": -3.3948614597320557, "logits/rejected": -3.347947597503662, "logps/chosen": -17.9841251373291, "logps/rejected": -37.400943756103516, "loss": 0.2728, "rewards/accuracies": 1.0, "rewards/chosen": 1.332104206085205, "rewards/margins": 1.332104206085205, "rewards/rejected": 0.0, "step": 331 }, { "epoch": 1.8547486033519553, "grad_norm": 9.317769026143214, "learning_rate": 9.992478109204587e-07, "logits/chosen": -3.4927287101745605, "logits/rejected": -3.7286641597747803, "logps/chosen": -14.411958694458008, "logps/rejected": -28.702346801757812, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 1.1069204807281494, "rewards/margins": 1.1069204807281494, "rewards/rejected": 0.0, "step": 332 }, { "epoch": 1.8603351955307263, "grad_norm": 14.201362831463452, "learning_rate": 9.992140177019743e-07, "logits/chosen": -3.479283571243286, "logits/rejected": -3.3960371017456055, "logps/chosen": -11.420783996582031, "logps/rejected": -22.475345611572266, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 0.8035995364189148, "rewards/margins": 0.8035995364189148, "rewards/rejected": 0.0, "step": 333 }, { "epoch": 1.8659217877094973, "grad_norm": 11.175373020780594, "learning_rate": 9.991794825625546e-07, "logits/chosen": -3.3228561878204346, "logits/rejected": -3.2982242107391357, "logps/chosen": -5.908030986785889, "logps/rejected": -56.5552978515625, "loss": 0.3385, "rewards/accuracies": 1.0, "rewards/chosen": 0.6184911727905273, "rewards/margins": 0.6184911727905273, "rewards/rejected": 0.0, "step": 334 }, { "epoch": 1.8715083798882681, "grad_norm": 13.277307948180802, "learning_rate": 9.991442055535246e-07, "logits/chosen": -3.829953670501709, "logits/rejected": -3.4723265171051025, "logps/chosen": -14.793371200561523, "logps/rejected": -20.083637237548828, "loss": 0.3148, "rewards/accuracies": 1.0, "rewards/chosen": 1.3210265636444092, "rewards/margins": 1.3210265636444092, "rewards/rejected": 0.0, "step": 335 }, { "epoch": 1.8770949720670391, "grad_norm": 9.406315729563916, "learning_rate": 9.991081867273126e-07, "logits/chosen": -3.4278557300567627, "logits/rejected": -3.540053606033325, "logps/chosen": -18.8541259765625, "logps/rejected": -33.45713806152344, "loss": 0.2721, "rewards/accuracies": 1.0, "rewards/chosen": 1.3648288249969482, "rewards/margins": 1.3648288249969482, "rewards/rejected": 0.0, "step": 336 }, { "epoch": 1.88268156424581, "grad_norm": 9.818598859075479, "learning_rate": 9.990714261374489e-07, "logits/chosen": -3.2425389289855957, "logits/rejected": -3.331890821456909, "logps/chosen": -10.86574935913086, "logps/rejected": -31.01609230041504, "loss": 0.3132, "rewards/accuracies": 1.0, "rewards/chosen": 0.6972233057022095, "rewards/margins": 0.6972233057022095, "rewards/rejected": 0.0, "step": 337 }, { "epoch": 1.888268156424581, "grad_norm": 10.079056695811483, "learning_rate": 9.99033923838566e-07, "logits/chosen": -3.457608222961426, "logits/rejected": -3.2599005699157715, "logps/chosen": -45.99235916137695, "logps/rejected": -35.339759826660156, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 1.6016557216644287, "rewards/margins": 1.6016557216644287, "rewards/rejected": 0.0, "step": 338 }, { "epoch": 1.893854748603352, "grad_norm": 11.410716488878608, "learning_rate": 9.989956798863992e-07, "logits/chosen": -3.503572940826416, "logits/rejected": -3.4884138107299805, "logps/chosen": -19.33344268798828, "logps/rejected": -22.809539794921875, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 1.0641868114471436, "rewards/margins": 1.0641868114471436, "rewards/rejected": 0.0, "step": 339 }, { "epoch": 1.899441340782123, "grad_norm": 12.560637580640856, "learning_rate": 9.989566943377859e-07, "logits/chosen": -3.2320213317871094, "logits/rejected": -3.377939462661743, "logps/chosen": -48.088279724121094, "logps/rejected": -21.49866485595703, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": 1.3534762859344482, "rewards/margins": 1.3534762859344482, "rewards/rejected": 0.0, "step": 340 }, { "epoch": 1.899441340782123, "eval_logits/chosen": -3.3132081031799316, "eval_logits/rejected": -3.350616931915283, "eval_logps/chosen": -19.246543884277344, "eval_logps/rejected": -31.018817901611328, "eval_loss": 0.3389597237110138, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.0823500156402588, "eval_rewards/margins": 1.0823500156402588, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5746, "eval_samples_per_second": 9.233, "eval_steps_per_second": 0.298, "step": 340 }, { "epoch": 1.905027932960894, "grad_norm": 9.892599407239048, "learning_rate": 9.989169672506653e-07, "logits/chosen": -3.5700721740722656, "logits/rejected": -3.4472672939300537, "logps/chosen": -15.808403015136719, "logps/rejected": -32.856849670410156, "loss": 0.2814, "rewards/accuracies": 1.0, "rewards/chosen": 1.3196511268615723, "rewards/margins": 1.3196511268615723, "rewards/rejected": 0.0, "step": 341 }, { "epoch": 1.910614525139665, "grad_norm": 13.061729648562062, "learning_rate": 9.988764986840794e-07, "logits/chosen": -3.519749402999878, "logits/rejected": -3.3913304805755615, "logps/chosen": -18.575572967529297, "logps/rejected": -45.02961349487305, "loss": 0.3381, "rewards/accuracies": 1.0, "rewards/chosen": 1.1368541717529297, "rewards/margins": 1.1368541717529297, "rewards/rejected": 0.0, "step": 342 }, { "epoch": 1.916201117318436, "grad_norm": 14.711655580356501, "learning_rate": 9.988352886981713e-07, "logits/chosen": -3.5021438598632812, "logits/rejected": -3.3737618923187256, "logps/chosen": -9.830192565917969, "logps/rejected": -30.507808685302734, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 0.9768810868263245, "rewards/margins": 0.9768810868263245, "rewards/rejected": 0.0, "step": 343 }, { "epoch": 1.9217877094972067, "grad_norm": 12.603322659212049, "learning_rate": 9.987933373541866e-07, "logits/chosen": -3.649998903274536, "logits/rejected": -3.4158873558044434, "logps/chosen": -47.723350524902344, "logps/rejected": -29.496013641357422, "loss": 0.308, "rewards/accuracies": 0.75, "rewards/chosen": 1.104366660118103, "rewards/margins": 1.104366660118103, "rewards/rejected": 0.0, "step": 344 }, { "epoch": 1.9273743016759777, "grad_norm": 9.37885249451381, "learning_rate": 9.98750644714472e-07, "logits/chosen": -3.400846242904663, "logits/rejected": -3.4327080249786377, "logps/chosen": -12.303414344787598, "logps/rejected": -16.289201736450195, "loss": 0.2985, "rewards/accuracies": 1.0, "rewards/chosen": 0.5962328314781189, "rewards/margins": 0.5962328314781189, "rewards/rejected": 0.0, "step": 345 }, { "epoch": 1.9329608938547485, "grad_norm": 11.10219966687012, "learning_rate": 9.987072108424771e-07, "logits/chosen": -3.700981378555298, "logits/rejected": -3.167149543762207, "logps/chosen": -13.309455871582031, "logps/rejected": -43.61916732788086, "loss": 0.3056, "rewards/accuracies": 1.0, "rewards/chosen": 0.9612671732902527, "rewards/margins": 0.9612671732902527, "rewards/rejected": 0.0, "step": 346 }, { "epoch": 1.9385474860335195, "grad_norm": 14.401670529551904, "learning_rate": 9.986630358027519e-07, "logits/chosen": -3.465524435043335, "logits/rejected": -3.315683126449585, "logps/chosen": -14.349411964416504, "logps/rejected": -32.112144470214844, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": 1.256392478942871, "rewards/margins": 1.256392478942871, "rewards/rejected": 0.0, "step": 347 }, { "epoch": 1.9441340782122905, "grad_norm": 11.261316217053585, "learning_rate": 9.986181196609485e-07, "logits/chosen": -3.340606689453125, "logits/rejected": -3.401167154312134, "logps/chosen": -34.154457092285156, "logps/rejected": -33.26017761230469, "loss": 0.3193, "rewards/accuracies": 0.75, "rewards/chosen": 0.9464106559753418, "rewards/margins": 0.9464106559753418, "rewards/rejected": 0.0, "step": 348 }, { "epoch": 1.9497206703910615, "grad_norm": 8.624436204570914, "learning_rate": 9.9857246248382e-07, "logits/chosen": -3.3766283988952637, "logits/rejected": -3.463714599609375, "logps/chosen": -8.690055847167969, "logps/rejected": -27.917278289794922, "loss": 0.2987, "rewards/accuracies": 1.0, "rewards/chosen": 0.7766602039337158, "rewards/margins": 0.7766602039337158, "rewards/rejected": 0.0, "step": 349 }, { "epoch": 1.9553072625698324, "grad_norm": 14.03088083668422, "learning_rate": 9.985260643392214e-07, "logits/chosen": -3.6260368824005127, "logits/rejected": -3.6323251724243164, "logps/chosen": -27.713470458984375, "logps/rejected": -29.578903198242188, "loss": 0.2988, "rewards/accuracies": 1.0, "rewards/chosen": 1.267095685005188, "rewards/margins": 1.267095685005188, "rewards/rejected": 0.0, "step": 350 }, { "epoch": 1.9608938547486034, "grad_norm": 12.16822330191893, "learning_rate": 9.984789252961087e-07, "logits/chosen": -3.0514800548553467, "logits/rejected": -2.936555862426758, "logps/chosen": -56.688716888427734, "logps/rejected": -36.15301513671875, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 1.2467468976974487, "rewards/margins": 1.2467468976974487, "rewards/rejected": 0.0, "step": 351 }, { "epoch": 1.9664804469273744, "grad_norm": 10.1811700058371, "learning_rate": 9.984310454245387e-07, "logits/chosen": -3.7579891681671143, "logits/rejected": -3.6203415393829346, "logps/chosen": -17.171201705932617, "logps/rejected": -24.970687866210938, "loss": 0.2861, "rewards/accuracies": 1.0, "rewards/chosen": 0.7694160342216492, "rewards/margins": 0.7694160342216492, "rewards/rejected": 0.0, "step": 352 }, { "epoch": 1.9720670391061452, "grad_norm": 8.383463260037288, "learning_rate": 9.983824247956697e-07, "logits/chosen": -3.534579277038574, "logits/rejected": -3.261944055557251, "logps/chosen": -21.65915298461914, "logps/rejected": -61.01673126220703, "loss": 0.2666, "rewards/accuracies": 1.0, "rewards/chosen": 1.189016342163086, "rewards/margins": 1.189016342163086, "rewards/rejected": 0.0, "step": 353 }, { "epoch": 1.9776536312849162, "grad_norm": 10.72433684140575, "learning_rate": 9.983330634817604e-07, "logits/chosen": -3.474168539047241, "logits/rejected": -3.3596277236938477, "logps/chosen": -29.677217483520508, "logps/rejected": -22.099822998046875, "loss": 0.2935, "rewards/accuracies": 1.0, "rewards/chosen": 1.5245972871780396, "rewards/margins": 1.5245972871780396, "rewards/rejected": 0.0, "step": 354 }, { "epoch": 1.983240223463687, "grad_norm": 9.773463439448527, "learning_rate": 9.982829615561703e-07, "logits/chosen": -3.77665638923645, "logits/rejected": -3.8121116161346436, "logps/chosen": -13.98667049407959, "logps/rejected": -18.671918869018555, "loss": 0.3192, "rewards/accuracies": 1.0, "rewards/chosen": 0.8396562337875366, "rewards/margins": 0.8396562337875366, "rewards/rejected": 0.0, "step": 355 }, { "epoch": 1.988826815642458, "grad_norm": 14.832142493277997, "learning_rate": 9.9823211909336e-07, "logits/chosen": -3.797262668609619, "logits/rejected": -3.7703962326049805, "logps/chosen": -13.690839767456055, "logps/rejected": -22.22066879272461, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 1.1787787675857544, "rewards/margins": 1.1787787675857544, "rewards/rejected": 0.0, "step": 356 }, { "epoch": 1.994413407821229, "grad_norm": 9.654778339478744, "learning_rate": 9.981805361688905e-07, "logits/chosen": -3.5876450538635254, "logits/rejected": -3.2162351608276367, "logps/chosen": -23.21221351623535, "logps/rejected": -22.66413688659668, "loss": 0.2689, "rewards/accuracies": 1.0, "rewards/chosen": 1.9285216331481934, "rewards/margins": 1.9285216331481934, "rewards/rejected": 0.0, "step": 357 }, { "epoch": 2.0, "grad_norm": 10.368296764057684, "learning_rate": 9.98128212859423e-07, "logits/chosen": -3.411379814147949, "logits/rejected": -3.3156909942626953, "logps/chosen": -26.243820190429688, "logps/rejected": -26.946678161621094, "loss": 0.2911, "rewards/accuracies": 0.75, "rewards/chosen": 0.3683784008026123, "rewards/margins": 0.3683784008026123, "rewards/rejected": 0.0, "step": 358 }, { "epoch": 2.005586592178771, "grad_norm": 6.853559733334418, "learning_rate": 9.980751492427196e-07, "logits/chosen": -2.940213680267334, "logits/rejected": -3.107919692993164, "logps/chosen": -23.83705711364746, "logps/rejected": -21.831621170043945, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 1.8738245964050293, "rewards/margins": 1.8738245964050293, "rewards/rejected": 0.0, "step": 359 }, { "epoch": 2.011173184357542, "grad_norm": 7.230543916216328, "learning_rate": 9.98021345397642e-07, "logits/chosen": -3.674678087234497, "logits/rejected": -3.4114761352539062, "logps/chosen": -7.945418834686279, "logps/rejected": -19.77797508239746, "loss": 0.2611, "rewards/accuracies": 1.0, "rewards/chosen": 0.9623333215713501, "rewards/margins": 0.9623333215713501, "rewards/rejected": 0.0, "step": 360 }, { "epoch": 2.011173184357542, "eval_logits/chosen": -3.352512836456299, "eval_logits/rejected": -3.3942763805389404, "eval_logps/chosen": -19.009695053100586, "eval_logps/rejected": -31.048065185546875, "eval_loss": 0.3390383720397949, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.106034517288208, "eval_rewards/margins": 1.106034517288208, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5901, "eval_samples_per_second": 9.229, "eval_steps_per_second": 0.298, "step": 360 }, { "epoch": 2.016759776536313, "grad_norm": 7.137277220971277, "learning_rate": 9.979668014041524e-07, "logits/chosen": -3.667314052581787, "logits/rejected": -3.686204195022583, "logps/chosen": -13.76287841796875, "logps/rejected": -24.61775779724121, "loss": 0.2442, "rewards/accuracies": 1.0, "rewards/chosen": 1.6051509380340576, "rewards/margins": 1.6051509380340576, "rewards/rejected": 0.0, "step": 361 }, { "epoch": 2.022346368715084, "grad_norm": 6.284282002970191, "learning_rate": 9.979115173433128e-07, "logits/chosen": -3.2760801315307617, "logits/rejected": -3.306630849838257, "logps/chosen": -31.155853271484375, "logps/rejected": -43.76316833496094, "loss": 0.2428, "rewards/accuracies": 1.0, "rewards/chosen": 1.9641227722167969, "rewards/margins": 1.9641227722167969, "rewards/rejected": 0.0, "step": 362 }, { "epoch": 2.0279329608938546, "grad_norm": 7.587844952549412, "learning_rate": 9.978554932972853e-07, "logits/chosen": -3.5016777515411377, "logits/rejected": -3.313396692276001, "logps/chosen": -25.92389488220215, "logps/rejected": -32.96814727783203, "loss": 0.2582, "rewards/accuracies": 1.0, "rewards/chosen": 1.5138459205627441, "rewards/margins": 1.5138459205627441, "rewards/rejected": 0.0, "step": 363 }, { "epoch": 2.0335195530726256, "grad_norm": 7.315707530774128, "learning_rate": 9.977987293493316e-07, "logits/chosen": -3.5630156993865967, "logits/rejected": -3.6592371463775635, "logps/chosen": -24.435527801513672, "logps/rejected": -34.446014404296875, "loss": 0.2369, "rewards/accuracies": 1.0, "rewards/chosen": 1.220048189163208, "rewards/margins": 1.220048189163208, "rewards/rejected": 0.0, "step": 364 }, { "epoch": 2.0391061452513966, "grad_norm": 6.938745469268472, "learning_rate": 9.97741225583813e-07, "logits/chosen": -3.7066171169281006, "logits/rejected": -3.533735752105713, "logps/chosen": -12.29153060913086, "logps/rejected": -30.40150260925293, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 1.2298760414123535, "rewards/margins": 1.2298760414123535, "rewards/rejected": 0.0, "step": 365 }, { "epoch": 2.0446927374301676, "grad_norm": 6.890346502664088, "learning_rate": 9.976829820861904e-07, "logits/chosen": -3.3776321411132812, "logits/rejected": -3.2434563636779785, "logps/chosen": -16.57495880126953, "logps/rejected": -17.069612503051758, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 1.5914559364318848, "rewards/margins": 1.5914559364318848, "rewards/rejected": 0.0, "step": 366 }, { "epoch": 2.0502793296089385, "grad_norm": 8.398925349958642, "learning_rate": 9.976239989430238e-07, "logits/chosen": -3.385850667953491, "logits/rejected": -3.657202959060669, "logps/chosen": -10.436373710632324, "logps/rejected": -21.774452209472656, "loss": 0.2498, "rewards/accuracies": 1.0, "rewards/chosen": 1.079164743423462, "rewards/margins": 1.079164743423462, "rewards/rejected": 0.0, "step": 367 }, { "epoch": 2.0558659217877095, "grad_norm": 6.474590748831068, "learning_rate": 9.975642762419727e-07, "logits/chosen": -3.501469373703003, "logits/rejected": -3.5434398651123047, "logps/chosen": -10.090056419372559, "logps/rejected": -40.99167251586914, "loss": 0.2454, "rewards/accuracies": 1.0, "rewards/chosen": 1.0617396831512451, "rewards/margins": 1.0617396831512451, "rewards/rejected": 0.0, "step": 368 }, { "epoch": 2.0614525139664805, "grad_norm": 7.754974370689558, "learning_rate": 9.975038140717958e-07, "logits/chosen": -3.7316436767578125, "logits/rejected": -3.6640937328338623, "logps/chosen": -9.623273849487305, "logps/rejected": -28.749025344848633, "loss": 0.2771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1402578353881836, "rewards/margins": 1.1402578353881836, "rewards/rejected": 0.0, "step": 369 }, { "epoch": 2.0670391061452515, "grad_norm": 7.824714594571133, "learning_rate": 9.974426125223506e-07, "logits/chosen": -3.76188063621521, "logits/rejected": -3.493408441543579, "logps/chosen": -8.70077896118164, "logps/rejected": -20.87495231628418, "loss": 0.2659, "rewards/accuracies": 1.0, "rewards/chosen": 1.1030755043029785, "rewards/margins": 1.1030755043029785, "rewards/rejected": 0.0, "step": 370 }, { "epoch": 2.0726256983240225, "grad_norm": 8.03964632747558, "learning_rate": 9.973806716845937e-07, "logits/chosen": -3.290515184402466, "logits/rejected": -3.3557488918304443, "logps/chosen": -18.478015899658203, "logps/rejected": -40.62440872192383, "loss": 0.2351, "rewards/accuracies": 1.0, "rewards/chosen": 1.738030195236206, "rewards/margins": 1.738030195236206, "rewards/rejected": 0.0, "step": 371 }, { "epoch": 2.078212290502793, "grad_norm": 7.024462513908243, "learning_rate": 9.973179916505799e-07, "logits/chosen": -3.6067373752593994, "logits/rejected": -3.547588586807251, "logps/chosen": -62.18476867675781, "logps/rejected": -46.74725341796875, "loss": 0.2089, "rewards/accuracies": 1.0, "rewards/chosen": 2.101921558380127, "rewards/margins": 2.101921558380127, "rewards/rejected": 0.0, "step": 372 }, { "epoch": 2.083798882681564, "grad_norm": 7.383675548720896, "learning_rate": 9.972545725134628e-07, "logits/chosen": -3.372720718383789, "logits/rejected": -3.7493672370910645, "logps/chosen": -13.201423645019531, "logps/rejected": -40.3868408203125, "loss": 0.2454, "rewards/accuracies": 1.0, "rewards/chosen": 1.3980975151062012, "rewards/margins": 1.3980975151062012, "rewards/rejected": 0.0, "step": 373 }, { "epoch": 2.089385474860335, "grad_norm": 8.722114344066854, "learning_rate": 9.97190414367495e-07, "logits/chosen": -3.102618932723999, "logits/rejected": -3.4442105293273926, "logps/chosen": -38.9439697265625, "logps/rejected": -23.601165771484375, "loss": 0.2853, "rewards/accuracies": 1.0, "rewards/chosen": 1.5407699346542358, "rewards/margins": 1.5407699346542358, "rewards/rejected": 0.0, "step": 374 }, { "epoch": 2.094972067039106, "grad_norm": 7.431900946832534, "learning_rate": 9.971255173080266e-07, "logits/chosen": -3.483156204223633, "logits/rejected": -3.4012656211853027, "logps/chosen": -13.505800247192383, "logps/rejected": -27.328590393066406, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 1.495887279510498, "rewards/margins": 1.495887279510498, "rewards/rejected": 0.0, "step": 375 }, { "epoch": 2.100558659217877, "grad_norm": 7.470933693936137, "learning_rate": 9.970598814315063e-07, "logits/chosen": -3.3025078773498535, "logits/rejected": -3.138798475265503, "logps/chosen": -20.259449005126953, "logps/rejected": -28.857093811035156, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 1.7965961694717407, "rewards/margins": 1.7965961694717407, "rewards/rejected": 0.0, "step": 376 }, { "epoch": 2.106145251396648, "grad_norm": 9.27200140273602, "learning_rate": 9.969935068354806e-07, "logits/chosen": -3.6736648082733154, "logits/rejected": -3.6531009674072266, "logps/chosen": -10.967897415161133, "logps/rejected": -24.558155059814453, "loss": 0.2284, "rewards/accuracies": 1.0, "rewards/chosen": 1.3979521989822388, "rewards/margins": 1.3979521989822388, "rewards/rejected": 0.0, "step": 377 }, { "epoch": 2.111731843575419, "grad_norm": 9.061565782826737, "learning_rate": 9.969263936185942e-07, "logits/chosen": -3.4653420448303223, "logits/rejected": -3.643233299255371, "logps/chosen": -42.82839584350586, "logps/rejected": -40.906795501708984, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.3158607482910156, "rewards/margins": 1.3158607482910156, "rewards/rejected": 0.0, "step": 378 }, { "epoch": 2.11731843575419, "grad_norm": 8.00290082777598, "learning_rate": 9.96858541880589e-07, "logits/chosen": -3.524867057800293, "logits/rejected": -3.4138917922973633, "logps/chosen": -15.306783676147461, "logps/rejected": -24.926111221313477, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 1.2128949165344238, "rewards/margins": 1.2128949165344238, "rewards/rejected": 0.0, "step": 379 }, { "epoch": 2.122905027932961, "grad_norm": 8.282038668199027, "learning_rate": 9.96789951722305e-07, "logits/chosen": -2.8289055824279785, "logits/rejected": -2.834775686264038, "logps/chosen": -15.831747055053711, "logps/rejected": -30.430438995361328, "loss": 0.2496, "rewards/accuracies": 1.0, "rewards/chosen": 1.0240150690078735, "rewards/margins": 1.0240150690078735, "rewards/rejected": 0.0, "step": 380 }, { "epoch": 2.122905027932961, "eval_logits/chosen": -3.3850479125976562, "eval_logits/rejected": -3.451110363006592, "eval_logps/chosen": -19.02072525024414, "eval_logps/rejected": -31.587453842163086, "eval_loss": 0.3512902557849884, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.1049315929412842, "eval_rewards/margins": 1.1049315929412842, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6034, "eval_samples_per_second": 9.225, "eval_steps_per_second": 0.298, "step": 380 }, { "epoch": 2.1284916201117317, "grad_norm": 7.659119705802381, "learning_rate": 9.967206232456792e-07, "logits/chosen": -3.3204312324523926, "logits/rejected": -3.4590940475463867, "logps/chosen": -29.15018081665039, "logps/rejected": -26.92755126953125, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 1.5056679248809814, "rewards/margins": 1.5056679248809814, "rewards/rejected": 0.0, "step": 381 }, { "epoch": 2.1340782122905027, "grad_norm": 7.341943731665606, "learning_rate": 9.966505565537462e-07, "logits/chosen": -3.463940382003784, "logits/rejected": -3.5414977073669434, "logps/chosen": -23.84471893310547, "logps/rejected": -43.20133590698242, "loss": 0.209, "rewards/accuracies": 1.0, "rewards/chosen": 2.0310187339782715, "rewards/margins": 2.0310187339782715, "rewards/rejected": 0.0, "step": 382 }, { "epoch": 2.1396648044692737, "grad_norm": 7.65895099687802, "learning_rate": 9.965797517506377e-07, "logits/chosen": -3.5571227073669434, "logits/rejected": -3.448068618774414, "logps/chosen": -12.737523078918457, "logps/rejected": -39.384822845458984, "loss": 0.2409, "rewards/accuracies": 1.0, "rewards/chosen": 1.6982755661010742, "rewards/margins": 1.6982755661010742, "rewards/rejected": 0.0, "step": 383 }, { "epoch": 2.1452513966480447, "grad_norm": 6.53867383915077, "learning_rate": 9.965082089415818e-07, "logits/chosen": -3.653449535369873, "logits/rejected": -3.518336772918701, "logps/chosen": -14.590206146240234, "logps/rejected": -39.92870330810547, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": 1.8198078870773315, "rewards/margins": 1.8198078870773315, "rewards/rejected": 0.0, "step": 384 }, { "epoch": 2.1508379888268156, "grad_norm": 11.769028103839736, "learning_rate": 9.964359282329042e-07, "logits/chosen": -3.7708849906921387, "logits/rejected": -3.714660406112671, "logps/chosen": -5.746259689331055, "logps/rejected": -25.73727798461914, "loss": 0.2974, "rewards/accuracies": 1.0, "rewards/chosen": 0.9119479060173035, "rewards/margins": 0.9119479060173035, "rewards/rejected": 0.0, "step": 385 }, { "epoch": 2.1564245810055866, "grad_norm": 7.368904952446889, "learning_rate": 9.96362909732027e-07, "logits/chosen": -3.477548837661743, "logits/rejected": -3.552312135696411, "logps/chosen": -23.343643188476562, "logps/rejected": -42.16249084472656, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 1.6210466623306274, "rewards/margins": 1.6210466623306274, "rewards/rejected": 0.0, "step": 386 }, { "epoch": 2.1620111731843576, "grad_norm": 7.806055517330862, "learning_rate": 9.962891535474683e-07, "logits/chosen": -3.7190170288085938, "logits/rejected": -3.836012840270996, "logps/chosen": -7.597694396972656, "logps/rejected": -36.90325164794922, "loss": 0.2476, "rewards/accuracies": 1.0, "rewards/chosen": 1.0124187469482422, "rewards/margins": 1.0124187469482422, "rewards/rejected": 0.0, "step": 387 }, { "epoch": 2.1675977653631286, "grad_norm": 7.731071810124617, "learning_rate": 9.962146597888433e-07, "logits/chosen": -3.7043721675872803, "logits/rejected": -3.74884033203125, "logps/chosen": -37.62725830078125, "logps/rejected": -27.989416122436523, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 1.7073400020599365, "rewards/margins": 1.7073400020599365, "rewards/rejected": 0.0, "step": 388 }, { "epoch": 2.1731843575418996, "grad_norm": 7.002769887579688, "learning_rate": 9.961394285668628e-07, "logits/chosen": -3.4300410747528076, "logits/rejected": -3.171128511428833, "logps/chosen": -16.042146682739258, "logps/rejected": -29.850797653198242, "loss": 0.2075, "rewards/accuracies": 1.0, "rewards/chosen": 1.3781628608703613, "rewards/margins": 1.3781628608703613, "rewards/rejected": 0.0, "step": 389 }, { "epoch": 2.17877094972067, "grad_norm": 8.783255120045123, "learning_rate": 9.96063459993334e-07, "logits/chosen": -3.6922378540039062, "logits/rejected": -3.79630184173584, "logps/chosen": -21.953340530395508, "logps/rejected": -23.263206481933594, "loss": 0.2502, "rewards/accuracies": 1.0, "rewards/chosen": 1.887188196182251, "rewards/margins": 1.887188196182251, "rewards/rejected": 0.0, "step": 390 }, { "epoch": 2.184357541899441, "grad_norm": 7.067604970449007, "learning_rate": 9.959867541811595e-07, "logits/chosen": -3.6089160442352295, "logits/rejected": -3.683166980743408, "logps/chosen": -20.209598541259766, "logps/rejected": -29.633625030517578, "loss": 0.204, "rewards/accuracies": 1.0, "rewards/chosen": 1.683326244354248, "rewards/margins": 1.683326244354248, "rewards/rejected": 0.0, "step": 391 }, { "epoch": 2.189944134078212, "grad_norm": 7.975251972964183, "learning_rate": 9.959093112443377e-07, "logits/chosen": -3.6021435260772705, "logits/rejected": -3.445990562438965, "logps/chosen": -24.468564987182617, "logps/rejected": -21.645599365234375, "loss": 0.2562, "rewards/accuracies": 1.0, "rewards/chosen": 2.0186338424682617, "rewards/margins": 2.0186338424682617, "rewards/rejected": 0.0, "step": 392 }, { "epoch": 2.195530726256983, "grad_norm": 8.238477675873897, "learning_rate": 9.958311312979631e-07, "logits/chosen": -3.7424333095550537, "logits/rejected": -3.7833523750305176, "logps/chosen": -22.42348861694336, "logps/rejected": -37.2231559753418, "loss": 0.2484, "rewards/accuracies": 1.0, "rewards/chosen": 1.0180425643920898, "rewards/margins": 1.0180425643920898, "rewards/rejected": 0.0, "step": 393 }, { "epoch": 2.201117318435754, "grad_norm": 7.069688666193769, "learning_rate": 9.957522144582244e-07, "logits/chosen": -3.653636932373047, "logits/rejected": -3.844578266143799, "logps/chosen": -49.03389358520508, "logps/rejected": -65.41432189941406, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": 1.2444148063659668, "rewards/margins": 1.2444148063659668, "rewards/rejected": 0.0, "step": 394 }, { "epoch": 2.206703910614525, "grad_norm": 7.451897833919531, "learning_rate": 9.956725608424065e-07, "logits/chosen": -3.4446585178375244, "logits/rejected": -3.402251958847046, "logps/chosen": -21.064973831176758, "logps/rejected": -19.122060775756836, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 1.3341941833496094, "rewards/margins": 1.3341941833496094, "rewards/rejected": 0.0, "step": 395 }, { "epoch": 2.212290502793296, "grad_norm": 8.402157956861563, "learning_rate": 9.955921705688887e-07, "logits/chosen": -3.3678083419799805, "logits/rejected": -3.352842330932617, "logps/chosen": -50.160003662109375, "logps/rejected": -31.446388244628906, "loss": 0.2071, "rewards/accuracies": 1.0, "rewards/chosen": 1.7660503387451172, "rewards/margins": 1.7660503387451172, "rewards/rejected": 0.0, "step": 396 }, { "epoch": 2.217877094972067, "grad_norm": 7.344066870799037, "learning_rate": 9.955110437571454e-07, "logits/chosen": -3.5992870330810547, "logits/rejected": -3.6589908599853516, "logps/chosen": -12.551340103149414, "logps/rejected": -37.615997314453125, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.4545259475708008, "rewards/margins": 1.4545259475708008, "rewards/rejected": 0.0, "step": 397 }, { "epoch": 2.223463687150838, "grad_norm": 7.220123935246985, "learning_rate": 9.954291805277454e-07, "logits/chosen": -3.710559606552124, "logits/rejected": -3.6783299446105957, "logps/chosen": -20.05228042602539, "logps/rejected": -30.739959716796875, "loss": 0.2515, "rewards/accuracies": 1.0, "rewards/chosen": 1.4118175506591797, "rewards/margins": 1.4118175506591797, "rewards/rejected": 0.0, "step": 398 }, { "epoch": 2.2290502793296088, "grad_norm": 7.4890890379198884, "learning_rate": 9.95346581002352e-07, "logits/chosen": -3.7515759468078613, "logits/rejected": -3.7081682682037354, "logps/chosen": -14.216001510620117, "logps/rejected": -45.78520202636719, "loss": 0.241, "rewards/accuracies": 1.0, "rewards/chosen": 1.5549489259719849, "rewards/margins": 1.5549489259719849, "rewards/rejected": 0.0, "step": 399 }, { "epoch": 2.2346368715083798, "grad_norm": 7.588464054751335, "learning_rate": 9.952632453037227e-07, "logits/chosen": -3.35744571685791, "logits/rejected": -3.4066972732543945, "logps/chosen": -11.357706069946289, "logps/rejected": -35.88401412963867, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 1.3197273015975952, "rewards/margins": 1.3197273015975952, "rewards/rejected": 0.0, "step": 400 }, { "epoch": 2.2346368715083798, "eval_logits/chosen": -3.406590223312378, "eval_logits/rejected": -3.4720587730407715, "eval_logps/chosen": -18.758052825927734, "eval_logps/rejected": -31.479137420654297, "eval_loss": 0.347927451133728, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.1311991214752197, "eval_rewards/margins": 1.1311991214752197, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6225, "eval_samples_per_second": 9.22, "eval_steps_per_second": 0.297, "step": 400 }, { "epoch": 2.2402234636871508, "grad_norm": 7.582352058613477, "learning_rate": 9.951791735557093e-07, "logits/chosen": -3.3883655071258545, "logits/rejected": -3.6184258460998535, "logps/chosen": -49.78010940551758, "logps/rejected": -40.60430145263672, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 1.381688117980957, "rewards/margins": 1.381688117980957, "rewards/rejected": 0.0, "step": 401 }, { "epoch": 2.2458100558659218, "grad_norm": 7.350648779219589, "learning_rate": 9.950943658832576e-07, "logits/chosen": -3.5008554458618164, "logits/rejected": -3.2886977195739746, "logps/chosen": -10.4576997756958, "logps/rejected": -28.83907699584961, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.4585580825805664, "rewards/margins": 1.4585580825805664, "rewards/rejected": 0.0, "step": 402 }, { "epoch": 2.2513966480446927, "grad_norm": 6.75041204068673, "learning_rate": 9.950088224124066e-07, "logits/chosen": -3.7398993968963623, "logits/rejected": -3.5482699871063232, "logps/chosen": -10.035907745361328, "logps/rejected": -38.832725524902344, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 1.1586952209472656, "rewards/margins": 1.1586952209472656, "rewards/rejected": 0.0, "step": 403 }, { "epoch": 2.2569832402234637, "grad_norm": 7.8954947377312745, "learning_rate": 9.949225432702892e-07, "logits/chosen": -3.1041765213012695, "logits/rejected": -3.3727102279663086, "logps/chosen": -27.308000564575195, "logps/rejected": -45.766178131103516, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": 1.3659512996673584, "rewards/margins": 1.3659512996673584, "rewards/rejected": 0.0, "step": 404 }, { "epoch": 2.2625698324022347, "grad_norm": 7.76564918974186, "learning_rate": 9.948355285851317e-07, "logits/chosen": -3.777467966079712, "logits/rejected": -3.669759750366211, "logps/chosen": -19.442195892333984, "logps/rejected": -21.301504135131836, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": 1.3987016677856445, "rewards/margins": 1.3987016677856445, "rewards/rejected": 0.0, "step": 405 }, { "epoch": 2.2681564245810057, "grad_norm": 8.368115428995738, "learning_rate": 9.947477784862535e-07, "logits/chosen": -3.5681798458099365, "logits/rejected": -3.5615100860595703, "logps/chosen": -31.800512313842773, "logps/rejected": -34.029930114746094, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 1.5696308612823486, "rewards/margins": 1.5696308612823486, "rewards/rejected": 0.0, "step": 406 }, { "epoch": 2.2737430167597763, "grad_norm": 7.02416768188933, "learning_rate": 9.946592931040665e-07, "logits/chosen": -3.4972879886627197, "logits/rejected": -3.69376277923584, "logps/chosen": -20.93242645263672, "logps/rejected": -23.23131561279297, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": 1.4333603382110596, "rewards/margins": 1.4333603382110596, "rewards/rejected": 0.0, "step": 407 }, { "epoch": 2.2793296089385473, "grad_norm": 8.308033162842571, "learning_rate": 9.945700725700759e-07, "logits/chosen": -3.5655198097229004, "logits/rejected": -3.555826187133789, "logps/chosen": -11.822471618652344, "logps/rejected": -23.9578800201416, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 1.2672007083892822, "rewards/margins": 1.2672007083892822, "rewards/rejected": 0.0, "step": 408 }, { "epoch": 2.2849162011173183, "grad_norm": 7.178366503388651, "learning_rate": 9.944801170168794e-07, "logits/chosen": -3.5784828662872314, "logits/rejected": -3.629476547241211, "logps/chosen": -18.417068481445312, "logps/rejected": -39.53338623046875, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 1.4985662698745728, "rewards/margins": 1.4985662698745728, "rewards/rejected": 0.0, "step": 409 }, { "epoch": 2.2905027932960893, "grad_norm": 7.5812514192209415, "learning_rate": 9.943894265781668e-07, "logits/chosen": -2.8852853775024414, "logits/rejected": -2.9510042667388916, "logps/chosen": -25.948524475097656, "logps/rejected": -56.017242431640625, "loss": 0.2209, "rewards/accuracies": 1.0, "rewards/chosen": 1.315206527709961, "rewards/margins": 1.315206527709961, "rewards/rejected": 0.0, "step": 410 }, { "epoch": 2.2960893854748603, "grad_norm": 8.091361697251866, "learning_rate": 9.942980013887206e-07, "logits/chosen": -3.0868020057678223, "logits/rejected": -3.217686891555786, "logps/chosen": -14.222066879272461, "logps/rejected": -55.355682373046875, "loss": 0.2509, "rewards/accuracies": 1.0, "rewards/chosen": 1.462952971458435, "rewards/margins": 1.462952971458435, "rewards/rejected": 0.0, "step": 411 }, { "epoch": 2.3016759776536313, "grad_norm": 7.704261312067314, "learning_rate": 9.942058415844147e-07, "logits/chosen": -3.523170232772827, "logits/rejected": -3.435476541519165, "logps/chosen": -29.237510681152344, "logps/rejected": -41.71282958984375, "loss": 0.2056, "rewards/accuracies": 1.0, "rewards/chosen": 2.677089214324951, "rewards/margins": 2.677089214324951, "rewards/rejected": 0.0, "step": 412 }, { "epoch": 2.3072625698324023, "grad_norm": 7.968377469779117, "learning_rate": 9.941129473022148e-07, "logits/chosen": -3.4820969104766846, "logits/rejected": -3.5996475219726562, "logps/chosen": -10.383661270141602, "logps/rejected": -46.434814453125, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": 1.342440128326416, "rewards/margins": 1.342440128326416, "rewards/rejected": 0.0, "step": 413 }, { "epoch": 2.3128491620111733, "grad_norm": 8.865221690856353, "learning_rate": 9.940193186801786e-07, "logits/chosen": -3.543195962905884, "logits/rejected": -3.466761589050293, "logps/chosen": -21.331546783447266, "logps/rejected": -60.1800537109375, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": 1.712676763534546, "rewards/margins": 1.712676763534546, "rewards/rejected": 0.0, "step": 414 }, { "epoch": 2.3184357541899443, "grad_norm": 6.842146244984532, "learning_rate": 9.939249558574548e-07, "logits/chosen": -3.5901739597320557, "logits/rejected": -3.483553647994995, "logps/chosen": -12.781819343566895, "logps/rejected": -24.080650329589844, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 1.1559152603149414, "rewards/margins": 1.1559152603149414, "rewards/rejected": 0.0, "step": 415 }, { "epoch": 2.3240223463687153, "grad_norm": 7.230405389338254, "learning_rate": 9.938298589742834e-07, "logits/chosen": -3.8077878952026367, "logits/rejected": -3.926487445831299, "logps/chosen": -9.203804016113281, "logps/rejected": -24.789108276367188, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 1.1044079065322876, "rewards/margins": 1.1044079065322876, "rewards/rejected": 0.0, "step": 416 }, { "epoch": 2.329608938547486, "grad_norm": 8.61336686443517, "learning_rate": 9.937340281719952e-07, "logits/chosen": -3.538182497024536, "logits/rejected": -3.41615629196167, "logps/chosen": -15.180647850036621, "logps/rejected": -33.629058837890625, "loss": 0.2205, "rewards/accuracies": 1.0, "rewards/chosen": 1.701636791229248, "rewards/margins": 1.701636791229248, "rewards/rejected": 0.0, "step": 417 }, { "epoch": 2.335195530726257, "grad_norm": 10.429641400424977, "learning_rate": 9.936374635930117e-07, "logits/chosen": -3.1239843368530273, "logits/rejected": -3.1988978385925293, "logps/chosen": -32.015865325927734, "logps/rejected": -20.780353546142578, "loss": 0.2093, "rewards/accuracies": 1.0, "rewards/chosen": 1.3216593265533447, "rewards/margins": 1.3216593265533447, "rewards/rejected": 0.0, "step": 418 }, { "epoch": 2.340782122905028, "grad_norm": 7.421014419343002, "learning_rate": 9.935401653808457e-07, "logits/chosen": -3.5012435913085938, "logits/rejected": -3.516075849533081, "logps/chosen": -22.774662017822266, "logps/rejected": -39.66460418701172, "loss": 0.2468, "rewards/accuracies": 1.0, "rewards/chosen": 1.757613182067871, "rewards/margins": 1.757613182067871, "rewards/rejected": 0.0, "step": 419 }, { "epoch": 2.346368715083799, "grad_norm": 7.576509851812004, "learning_rate": 9.934421336800992e-07, "logits/chosen": -3.8414852619171143, "logits/rejected": -3.465494394302368, "logps/chosen": -7.34761905670166, "logps/rejected": -38.78349304199219, "loss": 0.2657, "rewards/accuracies": 1.0, "rewards/chosen": 1.3509728908538818, "rewards/margins": 1.3509728908538818, "rewards/rejected": 0.0, "step": 420 }, { "epoch": 2.346368715083799, "eval_logits/chosen": -3.4067509174346924, "eval_logits/rejected": -3.4796643257141113, "eval_logps/chosen": -18.939788818359375, "eval_logps/rejected": -32.09042739868164, "eval_loss": 0.3535655438899994, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.1130255460739136, "eval_rewards/margins": 1.1130255460739136, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5488, "eval_samples_per_second": 9.24, "eval_steps_per_second": 0.298, "step": 420 }, { "epoch": 2.35195530726257, "grad_norm": 8.850244258366178, "learning_rate": 9.933433686364644e-07, "logits/chosen": -3.617330312728882, "logits/rejected": -3.3769824504852295, "logps/chosen": -15.00255012512207, "logps/rejected": -18.44194221496582, "loss": 0.2446, "rewards/accuracies": 1.0, "rewards/chosen": 1.4632066488265991, "rewards/margins": 1.4632066488265991, "rewards/rejected": 0.0, "step": 421 }, { "epoch": 2.357541899441341, "grad_norm": 7.3360611383001775, "learning_rate": 9.932438703967246e-07, "logits/chosen": -3.679149866104126, "logits/rejected": -3.724022150039673, "logps/chosen": -16.18329429626465, "logps/rejected": -46.84700012207031, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": 1.4130620956420898, "rewards/margins": 1.4130620956420898, "rewards/rejected": 0.0, "step": 422 }, { "epoch": 2.363128491620112, "grad_norm": 8.520519128108717, "learning_rate": 9.931436391087513e-07, "logits/chosen": -3.628382444381714, "logits/rejected": -3.237825632095337, "logps/chosen": -6.118841171264648, "logps/rejected": -23.551803588867188, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.0133485794067383, "rewards/margins": 1.0133485794067383, "rewards/rejected": 0.0, "step": 423 }, { "epoch": 2.368715083798883, "grad_norm": 7.844051635317521, "learning_rate": 9.930426749215063e-07, "logits/chosen": -3.580740451812744, "logits/rejected": -3.685987949371338, "logps/chosen": -21.19434356689453, "logps/rejected": -18.519508361816406, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 1.82901132106781, "rewards/margins": 1.82901132106781, "rewards/rejected": 0.0, "step": 424 }, { "epoch": 2.3743016759776534, "grad_norm": 10.937737527530368, "learning_rate": 9.9294097798504e-07, "logits/chosen": -3.3463058471679688, "logits/rejected": -3.4205501079559326, "logps/chosen": -20.50113296508789, "logps/rejected": -44.7275390625, "loss": 0.2862, "rewards/accuracies": 1.0, "rewards/chosen": 1.96407949924469, "rewards/margins": 1.96407949924469, "rewards/rejected": 0.0, "step": 425 }, { "epoch": 2.3798882681564244, "grad_norm": 9.66736665940558, "learning_rate": 9.928385484504927e-07, "logits/chosen": -3.729299306869507, "logits/rejected": -3.614223003387451, "logps/chosen": -11.2921142578125, "logps/rejected": -28.89381980895996, "loss": 0.2489, "rewards/accuracies": 1.0, "rewards/chosen": 1.2638719081878662, "rewards/margins": 1.2638719081878662, "rewards/rejected": 0.0, "step": 426 }, { "epoch": 2.3854748603351954, "grad_norm": 6.717558676276133, "learning_rate": 9.927353864700926e-07, "logits/chosen": -3.8699488639831543, "logits/rejected": -3.7441611289978027, "logps/chosen": -8.718036651611328, "logps/rejected": -17.121315002441406, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": 1.3821943998336792, "rewards/margins": 1.3821943998336792, "rewards/rejected": 0.0, "step": 427 }, { "epoch": 2.3910614525139664, "grad_norm": 7.9211753757705825, "learning_rate": 9.926314921971568e-07, "logits/chosen": -3.1198785305023193, "logits/rejected": -3.305750608444214, "logps/chosen": -20.42072296142578, "logps/rejected": -23.660072326660156, "loss": 0.23, "rewards/accuracies": 1.0, "rewards/chosen": 1.2869709730148315, "rewards/margins": 1.2869709730148315, "rewards/rejected": 0.0, "step": 428 }, { "epoch": 2.3966480446927374, "grad_norm": 12.073967218598673, "learning_rate": 9.925268657860906e-07, "logits/chosen": -3.7187838554382324, "logits/rejected": -3.7397525310516357, "logps/chosen": -7.898566246032715, "logps/rejected": -16.721696853637695, "loss": 0.2562, "rewards/accuracies": 1.0, "rewards/chosen": 1.1372358798980713, "rewards/margins": 1.1372358798980713, "rewards/rejected": 0.0, "step": 429 }, { "epoch": 2.4022346368715084, "grad_norm": 8.959279178556857, "learning_rate": 9.924215073923878e-07, "logits/chosen": -3.3832149505615234, "logits/rejected": -3.1709272861480713, "logps/chosen": -16.782611846923828, "logps/rejected": -25.03054428100586, "loss": 0.2458, "rewards/accuracies": 1.0, "rewards/chosen": 1.549168586730957, "rewards/margins": 1.549168586730957, "rewards/rejected": 0.0, "step": 430 }, { "epoch": 2.4078212290502794, "grad_norm": 10.542399892202047, "learning_rate": 9.923154171726295e-07, "logits/chosen": -3.636807441711426, "logits/rejected": -2.7520697116851807, "logps/chosen": -14.142077445983887, "logps/rejected": -35.1505241394043, "loss": 0.2751, "rewards/accuracies": 1.0, "rewards/chosen": 1.4068236351013184, "rewards/margins": 1.4068236351013184, "rewards/rejected": 0.0, "step": 431 }, { "epoch": 2.4134078212290504, "grad_norm": 8.389676538659716, "learning_rate": 9.922085952844844e-07, "logits/chosen": -3.3838095664978027, "logits/rejected": -3.5815417766571045, "logps/chosen": -20.966859817504883, "logps/rejected": -24.863521575927734, "loss": 0.2465, "rewards/accuracies": 1.0, "rewards/chosen": 1.4586759805679321, "rewards/margins": 1.4586759805679321, "rewards/rejected": 0.0, "step": 432 }, { "epoch": 2.4189944134078214, "grad_norm": 9.465693911271115, "learning_rate": 9.921010418867095e-07, "logits/chosen": -3.8997325897216797, "logits/rejected": -3.8577191829681396, "logps/chosen": -21.410032272338867, "logps/rejected": -22.073474884033203, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": 1.5003677606582642, "rewards/margins": 1.5003677606582642, "rewards/rejected": 0.0, "step": 433 }, { "epoch": 2.4245810055865924, "grad_norm": 7.341157159790346, "learning_rate": 9.91992757139148e-07, "logits/chosen": -3.2469494342803955, "logits/rejected": -3.2828922271728516, "logps/chosen": -11.902131080627441, "logps/rejected": -21.969844818115234, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 1.2229560613632202, "rewards/margins": 1.2229560613632202, "rewards/rejected": 0.0, "step": 434 }, { "epoch": 2.430167597765363, "grad_norm": 6.575904750471763, "learning_rate": 9.918837412027301e-07, "logits/chosen": -3.707895278930664, "logits/rejected": -3.5448644161224365, "logps/chosen": -20.005281448364258, "logps/rejected": -39.55828094482422, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 1.8787816762924194, "rewards/margins": 1.8787816762924194, "rewards/rejected": 0.0, "step": 435 }, { "epoch": 2.435754189944134, "grad_norm": 7.2042726517935245, "learning_rate": 9.917739942394732e-07, "logits/chosen": -3.557511806488037, "logits/rejected": -3.477961540222168, "logps/chosen": -10.696720123291016, "logps/rejected": -28.972217559814453, "loss": 0.2361, "rewards/accuracies": 1.0, "rewards/chosen": 1.1771442890167236, "rewards/margins": 1.1771442890167236, "rewards/rejected": 0.0, "step": 436 }, { "epoch": 2.441340782122905, "grad_norm": 7.603658907854762, "learning_rate": 9.916635164124807e-07, "logits/chosen": -3.7466676235198975, "logits/rejected": -3.890836000442505, "logps/chosen": -10.05591869354248, "logps/rejected": -53.3323860168457, "loss": 0.2175, "rewards/accuracies": 1.0, "rewards/chosen": 1.3430405855178833, "rewards/margins": 1.3430405855178833, "rewards/rejected": 0.0, "step": 437 }, { "epoch": 2.446927374301676, "grad_norm": 7.134620867385865, "learning_rate": 9.915523078859423e-07, "logits/chosen": -3.170605421066284, "logits/rejected": -3.387576103210449, "logps/chosen": -25.830745697021484, "logps/rejected": -18.52677345275879, "loss": 0.235, "rewards/accuracies": 1.0, "rewards/chosen": 1.5619460344314575, "rewards/margins": 1.5619460344314575, "rewards/rejected": 0.0, "step": 438 }, { "epoch": 2.452513966480447, "grad_norm": 7.558971744311051, "learning_rate": 9.914403688251336e-07, "logits/chosen": -3.418914794921875, "logits/rejected": -3.3568410873413086, "logps/chosen": -15.482650756835938, "logps/rejected": -60.44960403442383, "loss": 0.2175, "rewards/accuracies": 1.0, "rewards/chosen": 1.548828363418579, "rewards/margins": 1.548828363418579, "rewards/rejected": 0.0, "step": 439 }, { "epoch": 2.458100558659218, "grad_norm": 8.513656421826987, "learning_rate": 9.913276993964162e-07, "logits/chosen": -3.6556973457336426, "logits/rejected": -3.377732992172241, "logps/chosen": -33.013580322265625, "logps/rejected": -30.232341766357422, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 1.6873899698257446, "rewards/margins": 1.6873899698257446, "rewards/rejected": 0.0, "step": 440 }, { "epoch": 2.458100558659218, "eval_logits/chosen": -3.4010918140411377, "eval_logits/rejected": -3.471395969390869, "eval_logps/chosen": -18.80868148803711, "eval_logps/rejected": -31.709808349609375, "eval_loss": 0.3474169969558716, "eval_rewards/accuracies": 0.9750000238418579, "eval_rewards/chosen": 1.126136302947998, "eval_rewards/margins": 1.126136302947998, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5932, "eval_samples_per_second": 9.228, "eval_steps_per_second": 0.298, "step": 440 }, { "epoch": 2.463687150837989, "grad_norm": 8.45964119210809, "learning_rate": 9.912142997672365e-07, "logits/chosen": -3.669959306716919, "logits/rejected": -3.6248857975006104, "logps/chosen": -48.35924530029297, "logps/rejected": -22.86141586303711, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 2.139950752258301, "rewards/margins": 2.139950752258301, "rewards/rejected": 0.0, "step": 441 }, { "epoch": 2.46927374301676, "grad_norm": 8.336543648116804, "learning_rate": 9.911001701061272e-07, "logits/chosen": -3.458721399307251, "logits/rejected": -3.456511974334717, "logps/chosen": -7.8299360275268555, "logps/rejected": -15.928705215454102, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 1.279940128326416, "rewards/margins": 1.279940128326416, "rewards/rejected": 0.0, "step": 442 }, { "epoch": 2.4748603351955305, "grad_norm": 7.98398514794912, "learning_rate": 9.909853105827049e-07, "logits/chosen": -3.6968343257904053, "logits/rejected": -3.656633138656616, "logps/chosen": -11.460504531860352, "logps/rejected": -29.274757385253906, "loss": 0.2583, "rewards/accuracies": 1.0, "rewards/chosen": 1.0127944946289062, "rewards/margins": 1.0127944946289062, "rewards/rejected": 0.0, "step": 443 }, { "epoch": 2.4804469273743015, "grad_norm": 6.471490355639204, "learning_rate": 9.908697213676715e-07, "logits/chosen": -3.431952476501465, "logits/rejected": -3.3830695152282715, "logps/chosen": -26.654708862304688, "logps/rejected": -35.89697265625, "loss": 0.2052, "rewards/accuracies": 1.0, "rewards/chosen": 1.6807575225830078, "rewards/margins": 1.6807575225830078, "rewards/rejected": 0.0, "step": 444 }, { "epoch": 2.4860335195530725, "grad_norm": 8.775363392001863, "learning_rate": 9.907534026328129e-07, "logits/chosen": -3.72342848777771, "logits/rejected": -3.810544729232788, "logps/chosen": -8.127252578735352, "logps/rejected": -72.51339721679688, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 1.1973881721496582, "rewards/margins": 1.1973881721496582, "rewards/rejected": 0.0, "step": 445 }, { "epoch": 2.4916201117318435, "grad_norm": 9.159198455313787, "learning_rate": 9.906363545509994e-07, "logits/chosen": -3.7136473655700684, "logits/rejected": -3.652353525161743, "logps/chosen": -14.011004447937012, "logps/rejected": -19.61683464050293, "loss": 0.2467, "rewards/accuracies": 1.0, "rewards/chosen": 1.4680709838867188, "rewards/margins": 1.4680709838867188, "rewards/rejected": 0.0, "step": 446 }, { "epoch": 2.4972067039106145, "grad_norm": 7.922207692774938, "learning_rate": 9.905185772961856e-07, "logits/chosen": -3.517378091812134, "logits/rejected": -3.40279483795166, "logps/chosen": -40.9967041015625, "logps/rejected": -22.460494995117188, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 1.9493257999420166, "rewards/margins": 1.9493257999420166, "rewards/rejected": 0.0, "step": 447 }, { "epoch": 2.5027932960893855, "grad_norm": 7.398737562546686, "learning_rate": 9.904000710434096e-07, "logits/chosen": -3.7379915714263916, "logits/rejected": -3.6095502376556396, "logps/chosen": -20.070119857788086, "logps/rejected": -26.29829978942871, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 1.7872674465179443, "rewards/margins": 1.7872674465179443, "rewards/rejected": 0.0, "step": 448 }, { "epoch": 2.5083798882681565, "grad_norm": 7.070392361472554, "learning_rate": 9.902808359687924e-07, "logits/chosen": -3.3179070949554443, "logits/rejected": -3.313913583755493, "logps/chosen": -11.521672248840332, "logps/rejected": -53.297386169433594, "loss": 0.2465, "rewards/accuracies": 1.0, "rewards/chosen": 1.6013566255569458, "rewards/margins": 1.6013566255569458, "rewards/rejected": 0.0, "step": 449 }, { "epoch": 2.5139664804469275, "grad_norm": 6.400628795294673, "learning_rate": 9.901608722495387e-07, "logits/chosen": -3.8882439136505127, "logits/rejected": -3.842254400253296, "logps/chosen": -12.59760856628418, "logps/rejected": -25.522594451904297, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.5315548181533813, "rewards/margins": 1.5315548181533813, "rewards/rejected": 0.0, "step": 450 }, { "epoch": 2.5195530726256985, "grad_norm": 9.815768243812006, "learning_rate": 9.90040180063936e-07, "logits/chosen": -3.7810263633728027, "logits/rejected": -3.626575231552124, "logps/chosen": -7.732497215270996, "logps/rejected": -29.06593132019043, "loss": 0.2651, "rewards/accuracies": 1.0, "rewards/chosen": 1.313602328300476, "rewards/margins": 1.313602328300476, "rewards/rejected": 0.0, "step": 451 }, { "epoch": 2.5251396648044695, "grad_norm": 7.2065539188776375, "learning_rate": 9.899187595913544e-07, "logits/chosen": -3.0272104740142822, "logits/rejected": -2.841782808303833, "logps/chosen": -26.768041610717773, "logps/rejected": -32.70100021362305, "loss": 0.2298, "rewards/accuracies": 1.0, "rewards/chosen": 1.3412799835205078, "rewards/margins": 1.3412799835205078, "rewards/rejected": 0.0, "step": 452 }, { "epoch": 2.5307262569832405, "grad_norm": 8.772903781327832, "learning_rate": 9.89796611012246e-07, "logits/chosen": -3.745570182800293, "logits/rejected": -3.7247447967529297, "logps/chosen": -12.117805480957031, "logps/rejected": -28.30303192138672, "loss": 0.2516, "rewards/accuracies": 1.0, "rewards/chosen": 1.231385588645935, "rewards/margins": 1.231385588645935, "rewards/rejected": 0.0, "step": 453 }, { "epoch": 2.536312849162011, "grad_norm": 8.16462910523264, "learning_rate": 9.89673734508146e-07, "logits/chosen": -3.556431531906128, "logits/rejected": -3.6420626640319824, "logps/chosen": -10.046062469482422, "logps/rejected": -24.843278884887695, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 1.1439343690872192, "rewards/margins": 1.1439343690872192, "rewards/rejected": 0.0, "step": 454 }, { "epoch": 2.541899441340782, "grad_norm": 8.260323686769588, "learning_rate": 9.8955013026167e-07, "logits/chosen": -3.498030424118042, "logits/rejected": -3.5360107421875, "logps/chosen": -13.82684326171875, "logps/rejected": -24.774059295654297, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 1.1852829456329346, "rewards/margins": 1.1852829456329346, "rewards/rejected": 0.0, "step": 455 }, { "epoch": 2.547486033519553, "grad_norm": 6.82526879346672, "learning_rate": 9.894257984565167e-07, "logits/chosen": -3.50545597076416, "logits/rejected": -3.660874843597412, "logps/chosen": -11.925575256347656, "logps/rejected": -39.22270965576172, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 1.5857868194580078, "rewards/margins": 1.5857868194580078, "rewards/rejected": 0.0, "step": 456 }, { "epoch": 2.553072625698324, "grad_norm": 7.818925205163151, "learning_rate": 9.893007392774645e-07, "logits/chosen": -3.6695072650909424, "logits/rejected": -3.783515453338623, "logps/chosen": -6.81137752532959, "logps/rejected": -24.871543884277344, "loss": 0.2204, "rewards/accuracies": 1.0, "rewards/chosen": 0.9589478969573975, "rewards/margins": 0.9589478969573975, "rewards/rejected": 0.0, "step": 457 }, { "epoch": 2.558659217877095, "grad_norm": 7.808942118894329, "learning_rate": 9.891749529103745e-07, "logits/chosen": -3.488028049468994, "logits/rejected": -3.680635929107666, "logps/chosen": -7.478118896484375, "logps/rejected": -46.97710418701172, "loss": 0.2551, "rewards/accuracies": 1.0, "rewards/chosen": 0.7159961462020874, "rewards/margins": 0.7159961462020874, "rewards/rejected": 0.0, "step": 458 }, { "epoch": 2.564245810055866, "grad_norm": 7.385416058515253, "learning_rate": 9.890484395421869e-07, "logits/chosen": -3.5422539710998535, "logits/rejected": -3.697225570678711, "logps/chosen": -7.821656703948975, "logps/rejected": -36.82724380493164, "loss": 0.2341, "rewards/accuracies": 1.0, "rewards/chosen": 1.1111254692077637, "rewards/margins": 1.1111254692077637, "rewards/rejected": 0.0, "step": 459 }, { "epoch": 2.5698324022346366, "grad_norm": 9.10891011608351, "learning_rate": 9.889211993609234e-07, "logits/chosen": -3.4917874336242676, "logits/rejected": -3.5584025382995605, "logps/chosen": -29.07617950439453, "logps/rejected": -73.38496398925781, "loss": 0.2554, "rewards/accuracies": 1.0, "rewards/chosen": 1.4889638423919678, "rewards/margins": 1.4889638423919678, "rewards/rejected": 0.0, "step": 460 }, { "epoch": 2.5698324022346366, "eval_logits/chosen": -3.4113235473632812, "eval_logits/rejected": -3.4855563640594482, "eval_logps/chosen": -18.93764877319336, "eval_logps/rejected": -31.75335121154785, "eval_loss": 0.3509919345378876, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.1132395267486572, "eval_rewards/margins": 1.1132395267486572, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5433, "eval_samples_per_second": 9.242, "eval_steps_per_second": 0.298, "step": 460 }, { "epoch": 2.5754189944134076, "grad_norm": 7.519226930117591, "learning_rate": 9.887932325556858e-07, "logits/chosen": -3.36040997505188, "logits/rejected": -3.4190354347229004, "logps/chosen": -41.65183639526367, "logps/rejected": -25.77553367614746, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 1.757643699645996, "rewards/margins": 1.757643699645996, "rewards/rejected": 0.0, "step": 461 }, { "epoch": 2.5810055865921786, "grad_norm": 8.008746265043428, "learning_rate": 9.88664539316655e-07, "logits/chosen": -3.5281214714050293, "logits/rejected": -3.6912050247192383, "logps/chosen": -19.70013427734375, "logps/rejected": -29.371292114257812, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 1.6448612213134766, "rewards/margins": 1.6448612213134766, "rewards/rejected": 0.0, "step": 462 }, { "epoch": 2.5865921787709496, "grad_norm": 7.121357396179553, "learning_rate": 9.885351198350927e-07, "logits/chosen": -3.2395377159118652, "logits/rejected": -3.192934513092041, "logps/chosen": -12.96796989440918, "logps/rejected": -40.11272048950195, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 1.2172858715057373, "rewards/margins": 1.2172858715057373, "rewards/rejected": 0.0, "step": 463 }, { "epoch": 2.5921787709497206, "grad_norm": 6.943555180366184, "learning_rate": 9.884049743033388e-07, "logits/chosen": -3.6315016746520996, "logits/rejected": -3.5012624263763428, "logps/chosen": -10.144268989562988, "logps/rejected": -36.149906158447266, "loss": 0.2218, "rewards/accuracies": 1.0, "rewards/chosen": 1.6167594194412231, "rewards/margins": 1.6167594194412231, "rewards/rejected": 0.0, "step": 464 }, { "epoch": 2.5977653631284916, "grad_norm": 9.093457875219448, "learning_rate": 9.882741029148127e-07, "logits/chosen": -3.257049083709717, "logits/rejected": -3.3369534015655518, "logps/chosen": -61.227699279785156, "logps/rejected": -29.78754425048828, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 1.8105043172836304, "rewards/margins": 1.8105043172836304, "rewards/rejected": 0.0, "step": 465 }, { "epoch": 2.6033519553072626, "grad_norm": 9.024743638308934, "learning_rate": 9.88142505864013e-07, "logits/chosen": -3.7688000202178955, "logits/rejected": -3.7803730964660645, "logps/chosen": -16.94661521911621, "logps/rejected": -28.79592514038086, "loss": 0.2684, "rewards/accuracies": 1.0, "rewards/chosen": 1.6676511764526367, "rewards/margins": 1.6676511764526367, "rewards/rejected": 0.0, "step": 466 }, { "epoch": 2.6089385474860336, "grad_norm": 8.921806265731254, "learning_rate": 9.880101833465162e-07, "logits/chosen": -3.630502700805664, "logits/rejected": -3.7189152240753174, "logps/chosen": -8.833230018615723, "logps/rejected": -24.829486846923828, "loss": 0.2445, "rewards/accuracies": 1.0, "rewards/chosen": 1.2556135654449463, "rewards/margins": 1.2556135654449463, "rewards/rejected": 0.0, "step": 467 }, { "epoch": 2.6145251396648046, "grad_norm": 7.116479813143785, "learning_rate": 9.87877135558977e-07, "logits/chosen": -3.725851535797119, "logits/rejected": -3.6491265296936035, "logps/chosen": -25.265533447265625, "logps/rejected": -33.25012969970703, "loss": 0.1997, "rewards/accuracies": 1.0, "rewards/chosen": 1.8423246145248413, "rewards/margins": 1.8423246145248413, "rewards/rejected": 0.0, "step": 468 }, { "epoch": 2.6201117318435756, "grad_norm": 8.115055828693515, "learning_rate": 9.877433626991283e-07, "logits/chosen": -3.5986835956573486, "logits/rejected": -3.6389102935791016, "logps/chosen": -12.681318283081055, "logps/rejected": -43.688018798828125, "loss": 0.2724, "rewards/accuracies": 1.0, "rewards/chosen": 1.4984073638916016, "rewards/margins": 1.4984073638916016, "rewards/rejected": 0.0, "step": 469 }, { "epoch": 2.6256983240223466, "grad_norm": 7.880393166220747, "learning_rate": 9.876088649657802e-07, "logits/chosen": -4.028558731079102, "logits/rejected": -3.991490602493286, "logps/chosen": -11.951882362365723, "logps/rejected": -16.695070266723633, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 1.3863277435302734, "rewards/margins": 1.3863277435302734, "rewards/rejected": 0.0, "step": 470 }, { "epoch": 2.631284916201117, "grad_norm": 8.075539955799195, "learning_rate": 9.874736425588204e-07, "logits/chosen": -3.785693407058716, "logits/rejected": -3.8653554916381836, "logps/chosen": -13.701693534851074, "logps/rejected": -27.03780746459961, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.8585710525512695, "rewards/margins": 1.8585710525512695, "rewards/rejected": 0.0, "step": 471 }, { "epoch": 2.636871508379888, "grad_norm": 8.101379256216118, "learning_rate": 9.873376956792136e-07, "logits/chosen": -3.4866418838500977, "logits/rejected": -3.533543825149536, "logps/chosen": -11.929169654846191, "logps/rejected": -19.965227127075195, "loss": 0.2392, "rewards/accuracies": 1.0, "rewards/chosen": 1.2067784070968628, "rewards/margins": 1.2067784070968628, "rewards/rejected": 0.0, "step": 472 }, { "epoch": 2.642458100558659, "grad_norm": 7.236270883694466, "learning_rate": 9.872010245290008e-07, "logits/chosen": -3.7483625411987305, "logits/rejected": -3.351423978805542, "logps/chosen": -16.332185745239258, "logps/rejected": -20.266708374023438, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 1.5048670768737793, "rewards/margins": 1.5048670768737793, "rewards/rejected": 0.0, "step": 473 }, { "epoch": 2.64804469273743, "grad_norm": 9.621756868632152, "learning_rate": 9.870636293112998e-07, "logits/chosen": -3.9652161598205566, "logits/rejected": -3.7689316272735596, "logps/chosen": -17.28591537475586, "logps/rejected": -25.530614852905273, "loss": 0.2742, "rewards/accuracies": 1.0, "rewards/chosen": 1.6187388896942139, "rewards/margins": 1.6187388896942139, "rewards/rejected": 0.0, "step": 474 }, { "epoch": 2.653631284916201, "grad_norm": 7.0671066458309975, "learning_rate": 9.869255102303043e-07, "logits/chosen": -3.672330617904663, "logits/rejected": -3.6384193897247314, "logps/chosen": -20.17595100402832, "logps/rejected": -28.395429611206055, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": 1.3444428443908691, "rewards/margins": 1.3444428443908691, "rewards/rejected": 0.0, "step": 475 }, { "epoch": 2.659217877094972, "grad_norm": 8.07970111958636, "learning_rate": 9.86786667491284e-07, "logits/chosen": -3.524941921234131, "logits/rejected": -3.5708658695220947, "logps/chosen": -38.21711349487305, "logps/rejected": -21.77924919128418, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 1.5858030319213867, "rewards/margins": 1.5858030319213867, "rewards/rejected": 0.0, "step": 476 }, { "epoch": 2.664804469273743, "grad_norm": 7.784828158908748, "learning_rate": 9.866471013005837e-07, "logits/chosen": -3.4317426681518555, "logits/rejected": -3.399907112121582, "logps/chosen": -29.516658782958984, "logps/rejected": -28.625930786132812, "loss": 0.2304, "rewards/accuracies": 1.0, "rewards/chosen": 2.044358253479004, "rewards/margins": 2.044358253479004, "rewards/rejected": 0.0, "step": 477 }, { "epoch": 2.6703910614525137, "grad_norm": 8.152705669502572, "learning_rate": 9.865068118656239e-07, "logits/chosen": -3.7083640098571777, "logits/rejected": -3.397965669631958, "logps/chosen": -12.964552879333496, "logps/rejected": -51.40794372558594, "loss": 0.2329, "rewards/accuracies": 1.0, "rewards/chosen": 1.4145393371582031, "rewards/margins": 1.4145393371582031, "rewards/rejected": 0.0, "step": 478 }, { "epoch": 2.6759776536312847, "grad_norm": 8.268904824398946, "learning_rate": 9.863657993948996e-07, "logits/chosen": -3.613013982772827, "logits/rejected": -3.5054068565368652, "logps/chosen": -5.858638763427734, "logps/rejected": -18.040542602539062, "loss": 0.2504, "rewards/accuracies": 1.0, "rewards/chosen": 0.9335199594497681, "rewards/margins": 0.9335199594497681, "rewards/rejected": 0.0, "step": 479 }, { "epoch": 2.6815642458100557, "grad_norm": 7.9052242520574705, "learning_rate": 9.862240640979801e-07, "logits/chosen": -3.3727173805236816, "logits/rejected": -3.4428517818450928, "logps/chosen": -17.80929946899414, "logps/rejected": -37.69042205810547, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 1.604065179824829, "rewards/margins": 1.604065179824829, "rewards/rejected": 0.0, "step": 480 }, { "epoch": 2.6815642458100557, "eval_logits/chosen": -3.4260826110839844, "eval_logits/rejected": -3.4980411529541016, "eval_logps/chosen": -18.827129364013672, "eval_logps/rejected": -31.69999122619629, "eval_loss": 0.34813663363456726, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.124291181564331, "eval_rewards/margins": 1.124291181564331, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5466, "eval_samples_per_second": 9.241, "eval_steps_per_second": 0.298, "step": 480 }, { "epoch": 2.6871508379888267, "grad_norm": 7.902342051974376, "learning_rate": 9.860816061855097e-07, "logits/chosen": -3.8566508293151855, "logits/rejected": -3.9841768741607666, "logps/chosen": -15.78173828125, "logps/rejected": -19.682714462280273, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 1.2836341857910156, "rewards/margins": 1.2836341857910156, "rewards/rejected": 0.0, "step": 481 }, { "epoch": 2.6927374301675977, "grad_norm": 8.128367088301074, "learning_rate": 9.859384258692063e-07, "logits/chosen": -3.5985052585601807, "logits/rejected": -3.4468796253204346, "logps/chosen": -13.781343460083008, "logps/rejected": -39.90230178833008, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 1.5477707386016846, "rewards/margins": 1.5477707386016846, "rewards/rejected": 0.0, "step": 482 }, { "epoch": 2.6983240223463687, "grad_norm": 7.945758910520379, "learning_rate": 9.85794523361861e-07, "logits/chosen": -3.4471285343170166, "logits/rejected": -3.401249647140503, "logps/chosen": -29.81804656982422, "logps/rejected": -40.78144836425781, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 1.9413402080535889, "rewards/margins": 1.9413402080535889, "rewards/rejected": 0.0, "step": 483 }, { "epoch": 2.7039106145251397, "grad_norm": 7.482279160486083, "learning_rate": 9.856498988773388e-07, "logits/chosen": -3.764979600906372, "logits/rejected": -3.8511509895324707, "logps/chosen": -6.9386138916015625, "logps/rejected": -45.165382385253906, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": 0.9394733309745789, "rewards/margins": 0.9394733309745789, "rewards/rejected": 0.0, "step": 484 }, { "epoch": 2.7094972067039107, "grad_norm": 9.236290688906081, "learning_rate": 9.855045526305772e-07, "logits/chosen": -3.6754257678985596, "logits/rejected": -3.456483840942383, "logps/chosen": -9.34485149383545, "logps/rejected": -17.740493774414062, "loss": 0.2672, "rewards/accuracies": 1.0, "rewards/chosen": 1.4246737957000732, "rewards/margins": 1.4246737957000732, "rewards/rejected": 0.0, "step": 485 }, { "epoch": 2.7150837988826817, "grad_norm": 8.512651763195937, "learning_rate": 9.85358484837587e-07, "logits/chosen": -3.905971050262451, "logits/rejected": -3.810239791870117, "logps/chosen": -10.29494571685791, "logps/rejected": -13.215850830078125, "loss": 0.2433, "rewards/accuracies": 1.0, "rewards/chosen": 1.171828269958496, "rewards/margins": 1.171828269958496, "rewards/rejected": 0.0, "step": 486 }, { "epoch": 2.7206703910614527, "grad_norm": 7.14735012300244, "learning_rate": 9.852116957154505e-07, "logits/chosen": -3.829719066619873, "logits/rejected": -3.6599392890930176, "logps/chosen": -14.203679084777832, "logps/rejected": -27.11825180053711, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 1.285489797592163, "rewards/margins": 1.285489797592163, "rewards/rejected": 0.0, "step": 487 }, { "epoch": 2.7262569832402237, "grad_norm": 8.321189811146263, "learning_rate": 9.850641854823227e-07, "logits/chosen": -3.6217041015625, "logits/rejected": -3.5015790462493896, "logps/chosen": -19.579452514648438, "logps/rejected": -30.540557861328125, "loss": 0.2378, "rewards/accuracies": 1.0, "rewards/chosen": 1.5553691387176514, "rewards/margins": 1.5553691387176514, "rewards/rejected": 0.0, "step": 488 }, { "epoch": 2.7318435754189943, "grad_norm": 8.77169091111931, "learning_rate": 9.8491595435743e-07, "logits/chosen": -3.7543747425079346, "logits/rejected": -3.7363133430480957, "logps/chosen": -11.50475788116455, "logps/rejected": -21.293922424316406, "loss": 0.2359, "rewards/accuracies": 1.0, "rewards/chosen": 1.4948424100875854, "rewards/margins": 1.4948424100875854, "rewards/rejected": 0.0, "step": 489 }, { "epoch": 2.7374301675977653, "grad_norm": 7.473231553816813, "learning_rate": 9.847670025610706e-07, "logits/chosen": -3.4229557514190674, "logits/rejected": -3.723675489425659, "logps/chosen": -11.39542293548584, "logps/rejected": -31.53961753845215, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": 1.520459771156311, "rewards/margins": 1.520459771156311, "rewards/rejected": 0.0, "step": 490 }, { "epoch": 2.7430167597765363, "grad_norm": 7.167214974189555, "learning_rate": 9.84617330314613e-07, "logits/chosen": -3.6125621795654297, "logits/rejected": -3.4417295455932617, "logps/chosen": -9.274349212646484, "logps/rejected": -23.995391845703125, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 1.3021409511566162, "rewards/margins": 1.3021409511566162, "rewards/rejected": 0.0, "step": 491 }, { "epoch": 2.7486033519553073, "grad_norm": 6.673448358635546, "learning_rate": 9.84466937840497e-07, "logits/chosen": -3.8617923259735107, "logits/rejected": -3.861264944076538, "logps/chosen": -12.381843566894531, "logps/rejected": -40.608482360839844, "loss": 0.2046, "rewards/accuracies": 1.0, "rewards/chosen": 1.6555776596069336, "rewards/margins": 1.6555776596069336, "rewards/rejected": 0.0, "step": 492 }, { "epoch": 2.7541899441340782, "grad_norm": 6.906722025226586, "learning_rate": 9.843158253622323e-07, "logits/chosen": -3.7957656383514404, "logits/rejected": -3.8069732189178467, "logps/chosen": -11.624693870544434, "logps/rejected": -24.539522171020508, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 1.3188285827636719, "rewards/margins": 1.3188285827636719, "rewards/rejected": 0.0, "step": 493 }, { "epoch": 2.7597765363128492, "grad_norm": 7.284973496779757, "learning_rate": 9.841639931043993e-07, "logits/chosen": -3.497393846511841, "logits/rejected": -3.6471757888793945, "logps/chosen": -6.985918045043945, "logps/rejected": -34.42020034790039, "loss": 0.2501, "rewards/accuracies": 1.0, "rewards/chosen": 0.8936736583709717, "rewards/margins": 0.8936736583709717, "rewards/rejected": 0.0, "step": 494 }, { "epoch": 2.7653631284916202, "grad_norm": 9.564494734379064, "learning_rate": 9.840114412926477e-07, "logits/chosen": -3.9058496952056885, "logits/rejected": -4.01422643661499, "logps/chosen": -9.518606185913086, "logps/rejected": -35.89985275268555, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 1.1451034545898438, "rewards/margins": 1.1451034545898438, "rewards/rejected": 0.0, "step": 495 }, { "epoch": 2.770949720670391, "grad_norm": 10.982171123662225, "learning_rate": 9.838581701536966e-07, "logits/chosen": -3.650115728378296, "logits/rejected": -3.7084121704101562, "logps/chosen": -20.638866424560547, "logps/rejected": -29.952651977539062, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 2.4969639778137207, "rewards/margins": 2.4969639778137207, "rewards/rejected": 0.0, "step": 496 }, { "epoch": 2.776536312849162, "grad_norm": 10.535507629208146, "learning_rate": 9.83704179915334e-07, "logits/chosen": -3.655679225921631, "logits/rejected": -3.5606353282928467, "logps/chosen": -17.765445709228516, "logps/rejected": -28.804914474487305, "loss": 0.2594, "rewards/accuracies": 1.0, "rewards/chosen": 0.9974253177642822, "rewards/margins": 0.9974253177642822, "rewards/rejected": 0.0, "step": 497 }, { "epoch": 2.782122905027933, "grad_norm": 7.847733143629526, "learning_rate": 9.835494708064174e-07, "logits/chosen": -3.6984760761260986, "logits/rejected": -3.708005666732788, "logps/chosen": -8.091940879821777, "logps/rejected": -45.89476013183594, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 1.3567581176757812, "rewards/margins": 1.3567581176757812, "rewards/rejected": 0.0, "step": 498 }, { "epoch": 2.787709497206704, "grad_norm": 6.614344245198836, "learning_rate": 9.833940430568716e-07, "logits/chosen": -3.3254106044769287, "logits/rejected": -3.47277569770813, "logps/chosen": -18.11175537109375, "logps/rejected": -30.400558471679688, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 1.502833366394043, "rewards/margins": 1.502833366394043, "rewards/rejected": 0.0, "step": 499 }, { "epoch": 2.793296089385475, "grad_norm": 7.448538904626172, "learning_rate": 9.8323789689769e-07, "logits/chosen": -3.6233277320861816, "logits/rejected": -3.7375636100769043, "logps/chosen": -20.592002868652344, "logps/rejected": -21.011043548583984, "loss": 0.2073, "rewards/accuracies": 1.0, "rewards/chosen": 1.9914714097976685, "rewards/margins": 1.9914714097976685, "rewards/rejected": 0.0, "step": 500 }, { "epoch": 2.793296089385475, "eval_logits/chosen": -3.4131484031677246, "eval_logits/rejected": -3.4875328540802, "eval_logps/chosen": -18.787128448486328, "eval_logps/rejected": -31.88308334350586, "eval_loss": 0.3499985337257385, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.128291368484497, "eval_rewards/margins": 1.128291368484497, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6147, "eval_samples_per_second": 9.222, "eval_steps_per_second": 0.297, "step": 500 }, { "epoch": 2.798882681564246, "grad_norm": 7.006173844335316, "learning_rate": 9.830810325609336e-07, "logits/chosen": -3.4901397228240967, "logits/rejected": -3.6741926670074463, "logps/chosen": -8.288501739501953, "logps/rejected": -35.86272048950195, "loss": 0.2157, "rewards/accuracies": 1.0, "rewards/chosen": 1.1337344646453857, "rewards/margins": 1.1337344646453857, "rewards/rejected": 0.0, "step": 501 }, { "epoch": 2.804469273743017, "grad_norm": 6.873407950602945, "learning_rate": 9.829234502797308e-07, "logits/chosen": -3.667271137237549, "logits/rejected": -3.5253448486328125, "logps/chosen": -8.964853286743164, "logps/rejected": -29.030786514282227, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 1.5016932487487793, "rewards/margins": 1.5016932487487793, "rewards/rejected": 0.0, "step": 502 }, { "epoch": 2.810055865921788, "grad_norm": 8.405649575131, "learning_rate": 9.82765150288277e-07, "logits/chosen": -3.6840107440948486, "logits/rejected": -3.177286386489868, "logps/chosen": -10.823659896850586, "logps/rejected": -64.39732360839844, "loss": 0.232, "rewards/accuracies": 1.0, "rewards/chosen": 1.265246868133545, "rewards/margins": 1.265246868133545, "rewards/rejected": 0.0, "step": 503 }, { "epoch": 2.815642458100559, "grad_norm": 7.593265729493165, "learning_rate": 9.82606132821834e-07, "logits/chosen": -3.047468423843384, "logits/rejected": -3.244701385498047, "logps/chosen": -47.37431716918945, "logps/rejected": -28.011714935302734, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 1.6064038276672363, "rewards/margins": 1.6064038276672363, "rewards/rejected": 0.0, "step": 504 }, { "epoch": 2.82122905027933, "grad_norm": 6.952402357724541, "learning_rate": 9.824463981167303e-07, "logits/chosen": -3.7553091049194336, "logits/rejected": -3.633820056915283, "logps/chosen": -8.730056762695312, "logps/rejected": -17.908893585205078, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": 1.2194769382476807, "rewards/margins": 1.2194769382476807, "rewards/rejected": 0.0, "step": 505 }, { "epoch": 2.826815642458101, "grad_norm": 6.967895770582853, "learning_rate": 9.8228594641036e-07, "logits/chosen": -3.747936248779297, "logits/rejected": -3.849527359008789, "logps/chosen": -10.058979034423828, "logps/rejected": -21.122617721557617, "loss": 0.2191, "rewards/accuracies": 1.0, "rewards/chosen": 1.4362872838974, "rewards/margins": 1.4362872838974, "rewards/rejected": 0.0, "step": 506 }, { "epoch": 2.8324022346368714, "grad_norm": 10.638736519793175, "learning_rate": 9.82124777941183e-07, "logits/chosen": -3.6542043685913086, "logits/rejected": -3.6405322551727295, "logps/chosen": -13.411842346191406, "logps/rejected": -28.5947265625, "loss": 0.2546, "rewards/accuracies": 1.0, "rewards/chosen": 1.1960914134979248, "rewards/margins": 1.1960914134979248, "rewards/rejected": 0.0, "step": 507 }, { "epoch": 2.8379888268156424, "grad_norm": 6.862882851062471, "learning_rate": 9.81962892948724e-07, "logits/chosen": -3.875190258026123, "logits/rejected": -3.661947250366211, "logps/chosen": -18.41823387145996, "logps/rejected": -29.832141876220703, "loss": 0.2041, "rewards/accuracies": 1.0, "rewards/chosen": 1.8571171760559082, "rewards/margins": 1.8571171760559082, "rewards/rejected": 0.0, "step": 508 }, { "epoch": 2.8435754189944134, "grad_norm": 8.190086694414964, "learning_rate": 9.818002916735733e-07, "logits/chosen": -4.057163715362549, "logits/rejected": -3.9159181118011475, "logps/chosen": -11.774065017700195, "logps/rejected": -21.041200637817383, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 1.1493196487426758, "rewards/margins": 1.1493196487426758, "rewards/rejected": 0.0, "step": 509 }, { "epoch": 2.8491620111731844, "grad_norm": 8.668457802964342, "learning_rate": 9.816369743573851e-07, "logits/chosen": -3.3303356170654297, "logits/rejected": -3.314500093460083, "logps/chosen": -42.74287414550781, "logps/rejected": -19.058147430419922, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 1.6401937007904053, "rewards/margins": 1.6401937007904053, "rewards/rejected": 0.0, "step": 510 }, { "epoch": 2.8547486033519553, "grad_norm": 7.721684645719408, "learning_rate": 9.814729412428785e-07, "logits/chosen": -3.3857314586639404, "logits/rejected": -3.330643892288208, "logps/chosen": -20.835432052612305, "logps/rejected": -30.11691665649414, "loss": 0.1992, "rewards/accuracies": 1.0, "rewards/chosen": 1.359020471572876, "rewards/margins": 1.359020471572876, "rewards/rejected": 0.0, "step": 511 }, { "epoch": 2.8603351955307263, "grad_norm": 9.145606388010972, "learning_rate": 9.813081925738353e-07, "logits/chosen": -3.6362464427948, "logits/rejected": -3.593132972717285, "logps/chosen": -23.955276489257812, "logps/rejected": -30.369144439697266, "loss": 0.22, "rewards/accuracies": 1.0, "rewards/chosen": 1.6455130577087402, "rewards/margins": 1.6455130577087402, "rewards/rejected": 0.0, "step": 512 }, { "epoch": 2.8659217877094973, "grad_norm": 9.01464953544514, "learning_rate": 9.811427285951016e-07, "logits/chosen": -3.5415940284729004, "logits/rejected": -3.3448758125305176, "logps/chosen": -24.666587829589844, "logps/rejected": -30.633447647094727, "loss": 0.2234, "rewards/accuracies": 1.0, "rewards/chosen": 1.7907155752182007, "rewards/margins": 1.7907155752182007, "rewards/rejected": 0.0, "step": 513 }, { "epoch": 2.871508379888268, "grad_norm": 6.905190973282463, "learning_rate": 9.809765495525865e-07, "logits/chosen": -3.708414316177368, "logits/rejected": -3.320551872253418, "logps/chosen": -22.223108291625977, "logps/rejected": -56.73927688598633, "loss": 0.2047, "rewards/accuracies": 1.0, "rewards/chosen": 2.0079169273376465, "rewards/margins": 2.0079169273376465, "rewards/rejected": 0.0, "step": 514 }, { "epoch": 2.877094972067039, "grad_norm": 7.27438558863278, "learning_rate": 9.808096556932615e-07, "logits/chosen": -3.25369930267334, "logits/rejected": -3.4669570922851562, "logps/chosen": -15.14928913116455, "logps/rejected": -34.39002227783203, "loss": 0.2087, "rewards/accuracies": 1.0, "rewards/chosen": 1.2067991495132446, "rewards/margins": 1.2067991495132446, "rewards/rejected": 0.0, "step": 515 }, { "epoch": 2.88268156424581, "grad_norm": 8.695252550756638, "learning_rate": 9.806420472651606e-07, "logits/chosen": -3.400707960128784, "logits/rejected": -3.3857288360595703, "logps/chosen": -19.411767959594727, "logps/rejected": -33.77351760864258, "loss": 0.2396, "rewards/accuracies": 1.0, "rewards/chosen": 2.2417821884155273, "rewards/margins": 2.2417821884155273, "rewards/rejected": 0.0, "step": 516 }, { "epoch": 2.888268156424581, "grad_norm": 6.249609765410309, "learning_rate": 9.8047372451738e-07, "logits/chosen": -3.5469932556152344, "logits/rejected": -3.7076032161712646, "logps/chosen": -34.2308349609375, "logps/rejected": -22.720203399658203, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 1.6457676887512207, "rewards/margins": 1.6457676887512207, "rewards/rejected": 0.0, "step": 517 }, { "epoch": 2.893854748603352, "grad_norm": 8.155276171497226, "learning_rate": 9.803046877000769e-07, "logits/chosen": -3.419424533843994, "logits/rejected": -3.2604241371154785, "logps/chosen": -50.358097076416016, "logps/rejected": -29.022581100463867, "loss": 0.2053, "rewards/accuracies": 1.0, "rewards/chosen": 1.7308439016342163, "rewards/margins": 1.7308439016342163, "rewards/rejected": 0.0, "step": 518 }, { "epoch": 2.899441340782123, "grad_norm": 9.309994435464231, "learning_rate": 9.801349370644702e-07, "logits/chosen": -3.366185426712036, "logits/rejected": -3.3842945098876953, "logps/chosen": -5.38421630859375, "logps/rejected": -48.389190673828125, "loss": 0.2357, "rewards/accuracies": 1.0, "rewards/chosen": 0.9194574356079102, "rewards/margins": 0.9194574356079102, "rewards/rejected": 0.0, "step": 519 }, { "epoch": 2.905027932960894, "grad_norm": 9.390118081596956, "learning_rate": 9.799644728628399e-07, "logits/chosen": -3.5186169147491455, "logits/rejected": -3.3356292247772217, "logps/chosen": -15.109439849853516, "logps/rejected": -19.943649291992188, "loss": 0.2345, "rewards/accuracies": 1.0, "rewards/chosen": 0.694863498210907, "rewards/margins": 0.694863498210907, "rewards/rejected": 0.0, "step": 520 }, { "epoch": 2.905027932960894, "eval_logits/chosen": -3.4156765937805176, "eval_logits/rejected": -3.4941775798797607, "eval_logps/chosen": -18.871572494506836, "eval_logps/rejected": -31.991893768310547, "eval_loss": 0.35296905040740967, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": 1.1198468208312988, "eval_rewards/margins": 1.1198468208312988, "eval_rewards/rejected": 0.0, "eval_runtime": 33.604, "eval_samples_per_second": 9.225, "eval_steps_per_second": 0.298, "step": 520 }, { "epoch": 2.910614525139665, "grad_norm": 9.195833038191644, "learning_rate": 9.797932953485255e-07, "logits/chosen": -3.8919308185577393, "logits/rejected": -3.7082579135894775, "logps/chosen": -18.82931900024414, "logps/rejected": -58.64323425292969, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8860766887664795, "rewards/margins": 1.8860766887664795, "rewards/rejected": 0.0, "step": 521 }, { "epoch": 2.916201117318436, "grad_norm": 11.431428415303868, "learning_rate": 9.796214047759282e-07, "logits/chosen": -3.6113293170928955, "logits/rejected": -3.451960325241089, "logps/chosen": -31.122095108032227, "logps/rejected": -16.40538215637207, "loss": 0.232, "rewards/accuracies": 1.0, "rewards/chosen": 1.5730338096618652, "rewards/margins": 1.5730338096618652, "rewards/rejected": 0.0, "step": 522 }, { "epoch": 2.921787709497207, "grad_norm": 7.757319062171428, "learning_rate": 9.79448801400507e-07, "logits/chosen": -3.6258773803710938, "logits/rejected": -3.2235469818115234, "logps/chosen": -16.786638259887695, "logps/rejected": -26.439407348632812, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": 1.4600039720535278, "rewards/margins": 1.4600039720535278, "rewards/rejected": 0.0, "step": 523 }, { "epoch": 2.927374301675978, "grad_norm": 8.896054053489028, "learning_rate": 9.792754854787822e-07, "logits/chosen": -3.507004499435425, "logits/rejected": -3.4773612022399902, "logps/chosen": -23.045026779174805, "logps/rejected": -33.94160079956055, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 1.8514593839645386, "rewards/margins": 1.8514593839645386, "rewards/rejected": 0.0, "step": 524 }, { "epoch": 2.9329608938547485, "grad_norm": 7.173060291129353, "learning_rate": 9.791014572683315e-07, "logits/chosen": -3.7352986335754395, "logits/rejected": -3.105494499206543, "logps/chosen": -13.770273208618164, "logps/rejected": -40.190067291259766, "loss": 0.2375, "rewards/accuracies": 1.0, "rewards/chosen": 1.5704180002212524, "rewards/margins": 1.5704180002212524, "rewards/rejected": 0.0, "step": 525 }, { "epoch": 2.9385474860335195, "grad_norm": 7.558496228552007, "learning_rate": 9.78926717027792e-07, "logits/chosen": -3.6851797103881836, "logits/rejected": -3.4479734897613525, "logps/chosen": -16.109195709228516, "logps/rejected": -34.50910186767578, "loss": 0.2203, "rewards/accuracies": 1.0, "rewards/chosen": 1.467513918876648, "rewards/margins": 1.467513918876648, "rewards/rejected": 0.0, "step": 526 }, { "epoch": 2.9441340782122905, "grad_norm": 7.028257607923507, "learning_rate": 9.78751265016859e-07, "logits/chosen": -3.689628839492798, "logits/rejected": -3.438250780105591, "logps/chosen": -26.680370330810547, "logps/rejected": -34.16841125488281, "loss": 0.1758, "rewards/accuracies": 1.0, "rewards/chosen": 1.5826314687728882, "rewards/margins": 1.5826314687728882, "rewards/rejected": 0.0, "step": 527 }, { "epoch": 2.9497206703910615, "grad_norm": 7.452248237330729, "learning_rate": 9.785751014962851e-07, "logits/chosen": -3.894042730331421, "logits/rejected": -3.8289926052093506, "logps/chosen": -10.17105770111084, "logps/rejected": -21.5932559967041, "loss": 0.1969, "rewards/accuracies": 1.0, "rewards/chosen": 1.5189507007598877, "rewards/margins": 1.5189507007598877, "rewards/rejected": 0.0, "step": 528 }, { "epoch": 2.9553072625698324, "grad_norm": 7.606588657029743, "learning_rate": 9.783982267278808e-07, "logits/chosen": -3.688990831375122, "logits/rejected": -3.6647708415985107, "logps/chosen": -24.94097900390625, "logps/rejected": -26.211376190185547, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 1.5228779315948486, "rewards/margins": 1.5228779315948486, "rewards/rejected": 0.0, "step": 529 }, { "epoch": 2.9608938547486034, "grad_norm": 7.300424464888586, "learning_rate": 9.782206409745134e-07, "logits/chosen": -3.7147138118743896, "logits/rejected": -3.255930185317993, "logps/chosen": -8.188423156738281, "logps/rejected": -33.88630294799805, "loss": 0.2095, "rewards/accuracies": 1.0, "rewards/chosen": 1.0383009910583496, "rewards/margins": 1.0383009910583496, "rewards/rejected": 0.0, "step": 530 }, { "epoch": 2.9664804469273744, "grad_norm": 7.532588767917452, "learning_rate": 9.780423445001072e-07, "logits/chosen": -3.4041967391967773, "logits/rejected": -3.420712947845459, "logps/chosen": -43.992828369140625, "logps/rejected": -31.57377052307129, "loss": 0.2052, "rewards/accuracies": 1.0, "rewards/chosen": 2.2596793174743652, "rewards/margins": 2.2596793174743652, "rewards/rejected": 0.0, "step": 531 }, { "epoch": 2.972067039106145, "grad_norm": 9.639239919983133, "learning_rate": 9.778633375696424e-07, "logits/chosen": -3.5303702354431152, "logits/rejected": -3.5252037048339844, "logps/chosen": -17.531352996826172, "logps/rejected": -27.08810043334961, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 1.5886486768722534, "rewards/margins": 1.5886486768722534, "rewards/rejected": 0.0, "step": 532 }, { "epoch": 2.977653631284916, "grad_norm": 7.825710810550395, "learning_rate": 9.776836204491552e-07, "logits/chosen": -3.514402151107788, "logits/rejected": -3.4888627529144287, "logps/chosen": -28.722347259521484, "logps/rejected": -41.736698150634766, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 1.1408169269561768, "rewards/margins": 1.1408169269561768, "rewards/rejected": 0.0, "step": 533 }, { "epoch": 2.983240223463687, "grad_norm": 7.344748276963433, "learning_rate": 9.775031934057373e-07, "logits/chosen": -3.348198652267456, "logits/rejected": -3.621284246444702, "logps/chosen": -15.124707221984863, "logps/rejected": -28.8204345703125, "loss": 0.2101, "rewards/accuracies": 1.0, "rewards/chosen": 1.4255895614624023, "rewards/margins": 1.4255895614624023, "rewards/rejected": 0.0, "step": 534 }, { "epoch": 2.988826815642458, "grad_norm": 6.686215734213654, "learning_rate": 9.77322056707535e-07, "logits/chosen": -3.156296968460083, "logits/rejected": -3.096890449523926, "logps/chosen": -22.622802734375, "logps/rejected": -37.47029113769531, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 1.8413368463516235, "rewards/margins": 1.8413368463516235, "rewards/rejected": 0.0, "step": 535 }, { "epoch": 2.994413407821229, "grad_norm": 8.883809445571819, "learning_rate": 9.771402106237503e-07, "logits/chosen": -3.5562469959259033, "logits/rejected": -3.5949959754943848, "logps/chosen": -14.242838859558105, "logps/rejected": -36.978660583496094, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 1.5399631261825562, "rewards/margins": 1.5399631261825562, "rewards/rejected": 0.0, "step": 536 }, { "epoch": 3.0, "grad_norm": 5.951618447991793, "learning_rate": 9.769576554246383e-07, "logits/chosen": -3.464947462081909, "logits/rejected": -3.4362552165985107, "logps/chosen": -14.894698143005371, "logps/rejected": -22.000164031982422, "loss": 0.2003, "rewards/accuracies": 1.0, "rewards/chosen": 1.3502216339111328, "rewards/margins": 1.3502216339111328, "rewards/rejected": 0.0, "step": 537 }, { "epoch": 3.005586592178771, "grad_norm": 4.245095056572733, "learning_rate": 9.767743913815092e-07, "logits/chosen": -3.527196168899536, "logits/rejected": -3.772918701171875, "logps/chosen": -9.334918975830078, "logps/rejected": -30.373565673828125, "loss": 0.183, "rewards/accuracies": 1.0, "rewards/chosen": 1.5863807201385498, "rewards/margins": 1.5863807201385498, "rewards/rejected": 0.0, "step": 538 }, { "epoch": 3.011173184357542, "grad_norm": 5.1840999250382245, "learning_rate": 9.765904187667254e-07, "logits/chosen": -3.642043113708496, "logits/rejected": -3.252462863922119, "logps/chosen": -6.1675238609313965, "logps/rejected": -28.275943756103516, "loss": 0.2001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4355202913284302, "rewards/margins": 1.4355202913284302, "rewards/rejected": 0.0, "step": 539 }, { "epoch": 3.016759776536313, "grad_norm": 4.85681935133445, "learning_rate": 9.764057378537034e-07, "logits/chosen": -3.1955578327178955, "logits/rejected": -3.1691908836364746, "logps/chosen": -43.302391052246094, "logps/rejected": -48.15077590942383, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 2.09118390083313, "rewards/margins": 2.09118390083313, "rewards/rejected": 0.0, "step": 540 }, { "epoch": 3.016759776536313, "eval_logits/chosen": -3.426398754119873, "eval_logits/rejected": -3.50640869140625, "eval_logps/chosen": -18.674394607543945, "eval_logps/rejected": -32.07451629638672, "eval_loss": 0.3496454656124115, "eval_rewards/accuracies": 0.925000011920929, "eval_rewards/chosen": 1.1395649909973145, "eval_rewards/margins": 1.1395649909973145, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5941, "eval_samples_per_second": 9.228, "eval_steps_per_second": 0.298, "step": 540 }, { "epoch": 3.022346368715084, "grad_norm": 4.687407332922764, "learning_rate": 9.762203489169115e-07, "logits/chosen": -3.765223264694214, "logits/rejected": -3.6322054862976074, "logps/chosen": -10.266185760498047, "logps/rejected": -21.843515396118164, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 1.8560717105865479, "rewards/margins": 1.8560717105865479, "rewards/rejected": 0.0, "step": 541 }, { "epoch": 3.0279329608938546, "grad_norm": 4.518162104408857, "learning_rate": 9.76034252231871e-07, "logits/chosen": -3.720940589904785, "logits/rejected": -3.8452420234680176, "logps/chosen": -19.095359802246094, "logps/rejected": -28.30400276184082, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 2.2671868801116943, "rewards/margins": 2.2671868801116943, "rewards/rejected": 0.0, "step": 542 }, { "epoch": 3.0335195530726256, "grad_norm": 5.579688457027353, "learning_rate": 9.758474480751547e-07, "logits/chosen": -3.8501555919647217, "logits/rejected": -3.6848278045654297, "logps/chosen": -13.278109550476074, "logps/rejected": -39.80680465698242, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": 1.9774212837219238, "rewards/margins": 1.9774212837219238, "rewards/rejected": 0.0, "step": 543 }, { "epoch": 3.0391061452513966, "grad_norm": 4.411804230080304, "learning_rate": 9.756599367243868e-07, "logits/chosen": -3.7062551975250244, "logits/rejected": -3.4251039028167725, "logps/chosen": -15.456059455871582, "logps/rejected": -33.30834197998047, "loss": 0.1826, "rewards/accuracies": 1.0, "rewards/chosen": 2.043917179107666, "rewards/margins": 2.043917179107666, "rewards/rejected": 0.0, "step": 544 }, { "epoch": 3.0446927374301676, "grad_norm": 4.009334068053905, "learning_rate": 9.754717184582422e-07, "logits/chosen": -2.9957168102264404, "logits/rejected": -2.999037981033325, "logps/chosen": -33.969444274902344, "logps/rejected": -48.57637023925781, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 2.5263586044311523, "rewards/margins": 2.5263586044311523, "rewards/rejected": 0.0, "step": 545 }, { "epoch": 3.0502793296089385, "grad_norm": 4.7000551597518125, "learning_rate": 9.752827935564474e-07, "logits/chosen": -3.714012622833252, "logits/rejected": -3.739675521850586, "logps/chosen": -32.73251724243164, "logps/rejected": -31.016817092895508, "loss": 0.1843, "rewards/accuracies": 1.0, "rewards/chosen": 1.6957581043243408, "rewards/margins": 1.6957581043243408, "rewards/rejected": 0.0, "step": 546 }, { "epoch": 3.0558659217877095, "grad_norm": 5.036818270344218, "learning_rate": 9.75093162299778e-07, "logits/chosen": -3.67802095413208, "logits/rejected": -3.584538459777832, "logps/chosen": -31.94599151611328, "logps/rejected": -31.759218215942383, "loss": 0.1651, "rewards/accuracies": 1.0, "rewards/chosen": 2.74308443069458, "rewards/margins": 2.74308443069458, "rewards/rejected": 0.0, "step": 547 }, { "epoch": 3.0614525139664805, "grad_norm": 4.3369665011037375, "learning_rate": 9.7490282497006e-07, "logits/chosen": -3.5875184535980225, "logits/rejected": -3.5783565044403076, "logps/chosen": -15.221088409423828, "logps/rejected": -18.263507843017578, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 2.2133846282958984, "rewards/margins": 2.2133846282958984, "rewards/rejected": 0.0, "step": 548 }, { "epoch": 3.0670391061452515, "grad_norm": 4.64409448625801, "learning_rate": 9.747117818501687e-07, "logits/chosen": -3.67057204246521, "logits/rejected": -3.667259931564331, "logps/chosen": -5.813396453857422, "logps/rejected": -16.091121673583984, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 1.5230515003204346, "rewards/margins": 1.5230515003204346, "rewards/rejected": 0.0, "step": 549 }, { "epoch": 3.0726256983240225, "grad_norm": 5.647539059115093, "learning_rate": 9.745200332240278e-07, "logits/chosen": -3.5769731998443604, "logits/rejected": -3.6055614948272705, "logps/chosen": -15.150192260742188, "logps/rejected": -37.067344665527344, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 2.007539749145508, "rewards/margins": 2.007539749145508, "rewards/rejected": 0.0, "step": 550 }, { "epoch": 3.078212290502793, "grad_norm": 5.14672413289712, "learning_rate": 9.743275793766102e-07, "logits/chosen": -3.8731751441955566, "logits/rejected": -3.747708320617676, "logps/chosen": -26.233776092529297, "logps/rejected": -35.369693756103516, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": 1.7416244745254517, "rewards/margins": 1.7416244745254517, "rewards/rejected": 0.0, "step": 551 }, { "epoch": 3.083798882681564, "grad_norm": 4.849462299874129, "learning_rate": 9.741344205939364e-07, "logits/chosen": -3.7174947261810303, "logits/rejected": -3.245631217956543, "logps/chosen": -7.081581115722656, "logps/rejected": -37.488609313964844, "loss": 0.1541, "rewards/accuracies": 1.0, "rewards/chosen": 1.5629022121429443, "rewards/margins": 1.5629022121429443, "rewards/rejected": 0.0, "step": 552 }, { "epoch": 3.089385474860335, "grad_norm": 4.209133371240973, "learning_rate": 9.739405571630752e-07, "logits/chosen": -3.8818559646606445, "logits/rejected": -3.9326014518737793, "logps/chosen": -13.727590560913086, "logps/rejected": -22.858854293823242, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 1.7776994705200195, "rewards/margins": 1.7776994705200195, "rewards/rejected": 0.0, "step": 553 }, { "epoch": 3.094972067039106, "grad_norm": 4.999596680859089, "learning_rate": 9.737459893721415e-07, "logits/chosen": -3.8012993335723877, "logits/rejected": -3.6896021366119385, "logps/chosen": -3.1425883769989014, "logps/rejected": -53.1058464050293, "loss": 0.1771, "rewards/accuracies": 1.0, "rewards/chosen": 1.3161832094192505, "rewards/margins": 1.3161832094192505, "rewards/rejected": 0.0, "step": 554 }, { "epoch": 3.100558659217877, "grad_norm": 4.611992849402734, "learning_rate": 9.735507175102983e-07, "logits/chosen": -3.735285520553589, "logits/rejected": -3.5629961490631104, "logps/chosen": -7.803858757019043, "logps/rejected": -42.674530029296875, "loss": 0.1612, "rewards/accuracies": 1.0, "rewards/chosen": 1.3933154344558716, "rewards/margins": 1.3933154344558716, "rewards/rejected": 0.0, "step": 555 }, { "epoch": 3.106145251396648, "grad_norm": 4.925793425741922, "learning_rate": 9.733547418677538e-07, "logits/chosen": -3.6361725330352783, "logits/rejected": -3.8588008880615234, "logps/chosen": -10.72397232055664, "logps/rejected": -59.879058837890625, "loss": 0.1996, "rewards/accuracies": 1.0, "rewards/chosen": 1.8188023567199707, "rewards/margins": 1.8188023567199707, "rewards/rejected": 0.0, "step": 556 }, { "epoch": 3.111731843575419, "grad_norm": 5.475225802231006, "learning_rate": 9.731580627357635e-07, "logits/chosen": -3.7604262828826904, "logits/rejected": -3.319699287414551, "logps/chosen": -7.3406982421875, "logps/rejected": -58.49966049194336, "loss": 0.2007, "rewards/accuracies": 1.0, "rewards/chosen": 1.6027517318725586, "rewards/margins": 1.6027517318725586, "rewards/rejected": 0.0, "step": 557 }, { "epoch": 3.11731843575419, "grad_norm": 4.809392527499902, "learning_rate": 9.729606804066268e-07, "logits/chosen": -3.924943208694458, "logits/rejected": -3.829216241836548, "logps/chosen": -7.529721260070801, "logps/rejected": -54.85078811645508, "loss": 0.1983, "rewards/accuracies": 1.0, "rewards/chosen": 1.4421954154968262, "rewards/margins": 1.4421954154968262, "rewards/rejected": 0.0, "step": 558 }, { "epoch": 3.122905027932961, "grad_norm": 4.345413799217623, "learning_rate": 9.727625951736894e-07, "logits/chosen": -3.5260565280914307, "logits/rejected": -3.7493531703948975, "logps/chosen": -3.8053598403930664, "logps/rejected": -39.421329498291016, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 1.2054493427276611, "rewards/margins": 1.2054493427276611, "rewards/rejected": 0.0, "step": 559 }, { "epoch": 3.1284916201117317, "grad_norm": 5.22143163482338, "learning_rate": 9.725638073313412e-07, "logits/chosen": -2.7598049640655518, "logits/rejected": -2.6982693672180176, "logps/chosen": -28.23615264892578, "logps/rejected": -41.56056213378906, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": 1.8444175720214844, "rewards/margins": 1.8444175720214844, "rewards/rejected": 0.0, "step": 560 }, { "epoch": 3.1284916201117317, "eval_logits/chosen": -3.452139377593994, "eval_logits/rejected": -3.551227569580078, "eval_logps/chosen": -19.524335861206055, "eval_logps/rejected": -34.0481071472168, "eval_loss": 0.38711562752723694, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.0545704364776611, "eval_rewards/margins": 1.0545704364776611, "eval_rewards/rejected": 0.0, "eval_runtime": 33.6012, "eval_samples_per_second": 9.226, "eval_steps_per_second": 0.298, "step": 560 }, { "epoch": 3.1340782122905027, "grad_norm": 4.742808432195938, "learning_rate": 9.723643171750161e-07, "logits/chosen": -3.5465304851531982, "logits/rejected": -3.5002918243408203, "logps/chosen": -3.8645851612091064, "logps/rejected": -37.22589111328125, "loss": 0.1957, "rewards/accuracies": 1.0, "rewards/chosen": 1.2577381134033203, "rewards/margins": 1.2577381134033203, "rewards/rejected": 0.0, "step": 561 }, { "epoch": 3.1396648044692737, "grad_norm": 4.531387635169225, "learning_rate": 9.721641250011926e-07, "logits/chosen": -3.4351985454559326, "logits/rejected": -3.562939167022705, "logps/chosen": -25.417177200317383, "logps/rejected": -27.687837600708008, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 1.8707070350646973, "rewards/margins": 1.8707070350646973, "rewards/rejected": 0.0, "step": 562 }, { "epoch": 3.1452513966480447, "grad_norm": 5.785023128231763, "learning_rate": 9.719632311073912e-07, "logits/chosen": -3.410985231399536, "logits/rejected": -3.544358730316162, "logps/chosen": -7.332213401794434, "logps/rejected": -29.132205963134766, "loss": 0.1919, "rewards/accuracies": 1.0, "rewards/chosen": 1.501019835472107, "rewards/margins": 1.501019835472107, "rewards/rejected": 0.0, "step": 563 }, { "epoch": 3.1508379888268156, "grad_norm": 4.816389939776029, "learning_rate": 9.717616357921764e-07, "logits/chosen": -3.499091386795044, "logits/rejected": -3.3932666778564453, "logps/chosen": -9.977988243103027, "logps/rejected": -31.942049026489258, "loss": 0.2038, "rewards/accuracies": 1.0, "rewards/chosen": 1.4507946968078613, "rewards/margins": 1.4507946968078613, "rewards/rejected": 0.0, "step": 564 }, { "epoch": 3.1564245810055866, "grad_norm": 5.080691874623269, "learning_rate": 9.715593393551545e-07, "logits/chosen": -3.698702096939087, "logits/rejected": -3.818716287612915, "logps/chosen": -6.350429534912109, "logps/rejected": -32.38847732543945, "loss": 0.1626, "rewards/accuracies": 1.0, "rewards/chosen": 1.4038186073303223, "rewards/margins": 1.4038186073303223, "rewards/rejected": 0.0, "step": 565 }, { "epoch": 3.1620111731843576, "grad_norm": 4.900675475627213, "learning_rate": 9.713563420969744e-07, "logits/chosen": -3.4896557331085205, "logits/rejected": -3.595815420150757, "logps/chosen": -7.538320541381836, "logps/rejected": -51.425331115722656, "loss": 0.1624, "rewards/accuracies": 1.0, "rewards/chosen": 1.9275681972503662, "rewards/margins": 1.9275681972503662, "rewards/rejected": 0.0, "step": 566 }, { "epoch": 3.1675977653631286, "grad_norm": 4.772537916253512, "learning_rate": 9.711526443193258e-07, "logits/chosen": -3.5935449600219727, "logits/rejected": -3.7333123683929443, "logps/chosen": -14.802519798278809, "logps/rejected": -22.667762756347656, "loss": 0.1506, "rewards/accuracies": 1.0, "rewards/chosen": 1.9541294574737549, "rewards/margins": 1.9541294574737549, "rewards/rejected": 0.0, "step": 567 }, { "epoch": 3.1731843575418996, "grad_norm": 4.410642601758276, "learning_rate": 9.709482463249402e-07, "logits/chosen": -3.8915655612945557, "logits/rejected": -3.8851208686828613, "logps/chosen": -4.589975357055664, "logps/rejected": -21.963491439819336, "loss": 0.1891, "rewards/accuracies": 1.0, "rewards/chosen": 1.3541920185089111, "rewards/margins": 1.3541920185089111, "rewards/rejected": 0.0, "step": 568 }, { "epoch": 3.17877094972067, "grad_norm": 5.001923513173423, "learning_rate": 9.707431484175893e-07, "logits/chosen": -3.3935484886169434, "logits/rejected": -3.537262439727783, "logps/chosen": -25.665592193603516, "logps/rejected": -27.040699005126953, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": 2.2004714012145996, "rewards/margins": 2.2004714012145996, "rewards/rejected": 0.0, "step": 569 }, { "epoch": 3.184357541899441, "grad_norm": 6.779016548131147, "learning_rate": 9.705373509020848e-07, "logits/chosen": -3.615057945251465, "logits/rejected": -3.2200875282287598, "logps/chosen": -6.432480335235596, "logps/rejected": -58.13328170776367, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/chosen": 1.4290056228637695, "rewards/margins": 1.4290056228637695, "rewards/rejected": 0.0, "step": 570 }, { "epoch": 3.189944134078212, "grad_norm": 4.736026679285262, "learning_rate": 9.70330854084279e-07, "logits/chosen": -3.712716817855835, "logits/rejected": -3.572997570037842, "logps/chosen": -23.108036041259766, "logps/rejected": -22.306825637817383, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 2.165341854095459, "rewards/margins": 2.165341854095459, "rewards/rejected": 0.0, "step": 571 }, { "epoch": 3.195530726256983, "grad_norm": 5.626284254757533, "learning_rate": 9.701236582710627e-07, "logits/chosen": -3.353090763092041, "logits/rejected": -3.665696859359741, "logps/chosen": -27.17815399169922, "logps/rejected": -34.68605041503906, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": 2.3534014225006104, "rewards/margins": 2.3534014225006104, "rewards/rejected": 0.0, "step": 572 }, { "epoch": 3.201117318435754, "grad_norm": 4.870352035036208, "learning_rate": 9.699157637703656e-07, "logits/chosen": -3.9493651390075684, "logits/rejected": -3.839723825454712, "logps/chosen": -7.73994255065918, "logps/rejected": -40.916465759277344, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 1.5562421083450317, "rewards/margins": 1.5562421083450317, "rewards/rejected": 0.0, "step": 573 }, { "epoch": 3.206703910614525, "grad_norm": 5.277463053572278, "learning_rate": 9.697071708911563e-07, "logits/chosen": -3.6510212421417236, "logits/rejected": -3.637693166732788, "logps/chosen": -8.033740997314453, "logps/rejected": -30.34661293029785, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": 1.5631598234176636, "rewards/margins": 1.5631598234176636, "rewards/rejected": 0.0, "step": 574 }, { "epoch": 3.212290502793296, "grad_norm": 5.264094144924965, "learning_rate": 9.694978799434408e-07, "logits/chosen": -3.5529937744140625, "logits/rejected": -3.5506813526153564, "logps/chosen": -4.875071048736572, "logps/rejected": -36.16624450683594, "loss": 0.1843, "rewards/accuracies": 1.0, "rewards/chosen": 1.1266826391220093, "rewards/margins": 1.1266826391220093, "rewards/rejected": 0.0, "step": 575 }, { "epoch": 3.217877094972067, "grad_norm": 5.485669842712066, "learning_rate": 9.692878912382625e-07, "logits/chosen": -3.9466843605041504, "logits/rejected": -3.9598653316497803, "logps/chosen": -6.40760612487793, "logps/rejected": -22.46565055847168, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": 1.5205612182617188, "rewards/margins": 1.5205612182617188, "rewards/rejected": 0.0, "step": 576 }, { "epoch": 3.223463687150838, "grad_norm": 4.98802706808277, "learning_rate": 9.690772050877024e-07, "logits/chosen": -3.943087339401245, "logits/rejected": -3.983419418334961, "logps/chosen": -44.90244674682617, "logps/rejected": -22.856143951416016, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 2.329369306564331, "rewards/margins": 2.329369306564331, "rewards/rejected": 0.0, "step": 577 }, { "epoch": 3.2290502793296088, "grad_norm": 4.842990265689181, "learning_rate": 9.688658218048773e-07, "logits/chosen": -3.3331680297851562, "logits/rejected": -3.435561418533325, "logps/chosen": -51.9761848449707, "logps/rejected": -41.16301727294922, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 2.0154778957366943, "rewards/margins": 2.0154778957366943, "rewards/rejected": 0.0, "step": 578 }, { "epoch": 3.2346368715083798, "grad_norm": 5.312142250461408, "learning_rate": 9.686537417039407e-07, "logits/chosen": -3.582887887954712, "logits/rejected": -3.7412376403808594, "logps/chosen": -11.86709213256836, "logps/rejected": -42.311580657958984, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 1.5465190410614014, "rewards/margins": 1.5465190410614014, "rewards/rejected": 0.0, "step": 579 }, { "epoch": 3.2402234636871508, "grad_norm": 9.410288475259977, "learning_rate": 9.684409651000811e-07, "logits/chosen": -3.9253406524658203, "logits/rejected": -3.460865020751953, "logps/chosen": -10.171557426452637, "logps/rejected": -44.976715087890625, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 1.9322619438171387, "rewards/margins": 1.9322619438171387, "rewards/rejected": 0.0, "step": 580 }, { "epoch": 3.2402234636871508, "eval_logits/chosen": -3.462258815765381, "eval_logits/rejected": -3.5622501373291016, "eval_logps/chosen": -19.60980224609375, "eval_logps/rejected": -33.709877014160156, "eval_loss": 0.3881343901157379, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.0460238456726074, "eval_rewards/margins": 1.0460238456726074, "eval_rewards/rejected": 0.0, "eval_runtime": 33.583, "eval_samples_per_second": 9.231, "eval_steps_per_second": 0.298, "step": 580 }, { "epoch": 3.2458100558659218, "grad_norm": 6.161358425089739, "learning_rate": 9.682274923095229e-07, "logits/chosen": -3.1356165409088135, "logits/rejected": -3.395218849182129, "logps/chosen": -31.436750411987305, "logps/rejected": -31.18406867980957, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": 2.7328619956970215, "rewards/margins": 2.7328619956970215, "rewards/rejected": 0.0, "step": 581 }, { "epoch": 3.2513966480446927, "grad_norm": 4.578745281068452, "learning_rate": 9.68013323649524e-07, "logits/chosen": -3.405984878540039, "logits/rejected": -3.7583210468292236, "logps/chosen": -7.036988258361816, "logps/rejected": -24.276416778564453, "loss": 0.1941, "rewards/accuracies": 1.0, "rewards/chosen": 1.6983870267868042, "rewards/margins": 1.6983870267868042, "rewards/rejected": 0.0, "step": 582 }, { "epoch": 3.2569832402234637, "grad_norm": 5.9174722341795825, "learning_rate": 9.677984594383778e-07, "logits/chosen": -3.7339975833892822, "logits/rejected": -3.9361507892608643, "logps/chosen": -17.876081466674805, "logps/rejected": -24.841102600097656, "loss": 0.1399, "rewards/accuracies": 1.0, "rewards/chosen": 1.7446861267089844, "rewards/margins": 1.7446861267089844, "rewards/rejected": 0.0, "step": 583 }, { "epoch": 3.2625698324022347, "grad_norm": 6.793699025911994, "learning_rate": 9.675828999954104e-07, "logits/chosen": -3.9633615016937256, "logits/rejected": -3.6193535327911377, "logps/chosen": -9.287819862365723, "logps/rejected": -38.18955993652344, "loss": 0.1919, "rewards/accuracies": 1.0, "rewards/chosen": 1.739070177078247, "rewards/margins": 1.739070177078247, "rewards/rejected": 0.0, "step": 584 }, { "epoch": 3.2681564245810057, "grad_norm": 5.075795557710363, "learning_rate": 9.673666456409817e-07, "logits/chosen": -3.2759780883789062, "logits/rejected": -3.0620763301849365, "logps/chosen": -22.58893585205078, "logps/rejected": -75.91329956054688, "loss": 0.1507, "rewards/accuracies": 1.0, "rewards/chosen": 1.643172264099121, "rewards/margins": 1.643172264099121, "rewards/rejected": 0.0, "step": 585 }, { "epoch": 3.2737430167597763, "grad_norm": 4.83784737013629, "learning_rate": 9.671496966964842e-07, "logits/chosen": -3.522770881652832, "logits/rejected": -3.5017261505126953, "logps/chosen": -24.941234588623047, "logps/rejected": -32.05644989013672, "loss": 0.1712, "rewards/accuracies": 1.0, "rewards/chosen": 2.4505114555358887, "rewards/margins": 2.4505114555358887, "rewards/rejected": 0.0, "step": 586 }, { "epoch": 3.2793296089385473, "grad_norm": 4.962956416097082, "learning_rate": 9.669320534843425e-07, "logits/chosen": -3.6959798336029053, "logits/rejected": -3.7863824367523193, "logps/chosen": -23.056482315063477, "logps/rejected": -43.51755142211914, "loss": 0.1876, "rewards/accuracies": 1.0, "rewards/chosen": 1.9795132875442505, "rewards/margins": 1.9795132875442505, "rewards/rejected": 0.0, "step": 587 }, { "epoch": 3.2849162011173183, "grad_norm": 5.535105657873138, "learning_rate": 9.667137163280133e-07, "logits/chosen": -3.588747024536133, "logits/rejected": -3.5322866439819336, "logps/chosen": -24.421964645385742, "logps/rejected": -36.126529693603516, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 2.2174808979034424, "rewards/margins": 2.2174808979034424, "rewards/rejected": 0.0, "step": 588 }, { "epoch": 3.2905027932960893, "grad_norm": 4.949491208974845, "learning_rate": 9.664946855519843e-07, "logits/chosen": -3.6174607276916504, "logits/rejected": -3.545384407043457, "logps/chosen": -10.511805534362793, "logps/rejected": -33.92623519897461, "loss": 0.1705, "rewards/accuracies": 1.0, "rewards/chosen": 1.7961182594299316, "rewards/margins": 1.7961182594299316, "rewards/rejected": 0.0, "step": 589 }, { "epoch": 3.2960893854748603, "grad_norm": 4.832052588159931, "learning_rate": 9.662749614817746e-07, "logits/chosen": -3.5949666500091553, "logits/rejected": -3.5336391925811768, "logps/chosen": -16.682287216186523, "logps/rejected": -39.20673370361328, "loss": 0.1686, "rewards/accuracies": 1.0, "rewards/chosen": 1.7638062238693237, "rewards/margins": 1.7638062238693237, "rewards/rejected": 0.0, "step": 590 }, { "epoch": 3.3016759776536313, "grad_norm": 4.374631165657219, "learning_rate": 9.660545444439332e-07, "logits/chosen": -3.721179485321045, "logits/rejected": -3.7360987663269043, "logps/chosen": -8.91193962097168, "logps/rejected": -36.664329528808594, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 1.7523739337921143, "rewards/margins": 1.7523739337921143, "rewards/rejected": 0.0, "step": 591 }, { "epoch": 3.3072625698324023, "grad_norm": 6.306868741111923, "learning_rate": 9.658334347660389e-07, "logits/chosen": -2.937652349472046, "logits/rejected": -3.0750551223754883, "logps/chosen": -18.957584381103516, "logps/rejected": -55.87245178222656, "loss": 0.1598, "rewards/accuracies": 1.0, "rewards/chosen": 1.6617270708084106, "rewards/margins": 1.6617270708084106, "rewards/rejected": 0.0, "step": 592 }, { "epoch": 3.3128491620111733, "grad_norm": 6.186761405552854, "learning_rate": 9.656116327767002e-07, "logits/chosen": -3.8097431659698486, "logits/rejected": -3.948094129562378, "logps/chosen": -19.085521697998047, "logps/rejected": -32.111228942871094, "loss": 0.1701, "rewards/accuracies": 1.0, "rewards/chosen": 2.1641557216644287, "rewards/margins": 2.1641557216644287, "rewards/rejected": 0.0, "step": 593 }, { "epoch": 3.3184357541899443, "grad_norm": 4.4902103816695, "learning_rate": 9.65389138805554e-07, "logits/chosen": -3.5979137420654297, "logits/rejected": -3.61662220954895, "logps/chosen": -7.313446998596191, "logps/rejected": -33.970970153808594, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 1.4582792520523071, "rewards/margins": 1.4582792520523071, "rewards/rejected": 0.0, "step": 594 }, { "epoch": 3.3240223463687153, "grad_norm": 5.527789456110965, "learning_rate": 9.651659531832665e-07, "logits/chosen": -3.4975526332855225, "logits/rejected": -3.5394434928894043, "logps/chosen": -26.84719467163086, "logps/rejected": -37.31050109863281, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 2.6903393268585205, "rewards/margins": 2.6903393268585205, "rewards/rejected": 0.0, "step": 595 }, { "epoch": 3.329608938547486, "grad_norm": 6.680730860628917, "learning_rate": 9.64942076241531e-07, "logits/chosen": -3.551564931869507, "logits/rejected": -3.630307674407959, "logps/chosen": -13.276336669921875, "logps/rejected": -29.829763412475586, "loss": 0.1894, "rewards/accuracies": 1.0, "rewards/chosen": 1.529152750968933, "rewards/margins": 1.529152750968933, "rewards/rejected": 0.0, "step": 596 }, { "epoch": 3.335195530726257, "grad_norm": 4.386712935890993, "learning_rate": 9.647175083130687e-07, "logits/chosen": -3.4997851848602295, "logits/rejected": -3.688692331314087, "logps/chosen": -3.3455991744995117, "logps/rejected": -41.719268798828125, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 1.3428454399108887, "rewards/margins": 1.3428454399108887, "rewards/rejected": 0.0, "step": 597 }, { "epoch": 3.340782122905028, "grad_norm": 5.648244605274734, "learning_rate": 9.644922497316272e-07, "logits/chosen": -3.492734670639038, "logits/rejected": -3.720151662826538, "logps/chosen": -14.767166137695312, "logps/rejected": -77.58329010009766, "loss": 0.1865, "rewards/accuracies": 1.0, "rewards/chosen": 1.538488507270813, "rewards/margins": 1.538488507270813, "rewards/rejected": 0.0, "step": 598 }, { "epoch": 3.346368715083799, "grad_norm": 4.483493491856391, "learning_rate": 9.642663008319811e-07, "logits/chosen": -3.735584020614624, "logits/rejected": -3.4455156326293945, "logps/chosen": -8.765111923217773, "logps/rejected": -49.826454162597656, "loss": 0.163, "rewards/accuracies": 1.0, "rewards/chosen": 1.848058819770813, "rewards/margins": 1.848058819770813, "rewards/rejected": 0.0, "step": 599 }, { "epoch": 3.35195530726257, "grad_norm": 4.781236911208581, "learning_rate": 9.640396619499307e-07, "logits/chosen": -3.654815435409546, "logits/rejected": -3.8434436321258545, "logps/chosen": -5.785635948181152, "logps/rejected": -44.263404846191406, "loss": 0.1733, "rewards/accuracies": 1.0, "rewards/chosen": 1.4081251621246338, "rewards/margins": 1.4081251621246338, "rewards/rejected": 0.0, "step": 600 }, { "epoch": 3.35195530726257, "eval_logits/chosen": -3.4567484855651855, "eval_logits/rejected": -3.5548903942108154, "eval_logps/chosen": -19.334123611450195, "eval_logps/rejected": -33.267757415771484, "eval_loss": 0.3792971074581146, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.073591947555542, "eval_rewards/margins": 1.073591947555542, "eval_rewards/rejected": 0.0, "eval_runtime": 33.5681, "eval_samples_per_second": 9.235, "eval_steps_per_second": 0.298, "step": 600 }, { "epoch": 3.357541899441341, "grad_norm": 5.051227255910184, "learning_rate": 9.638123334223016e-07, "logits/chosen": -3.534510850906372, "logits/rejected": -3.752387523651123, "logps/chosen": -5.093735218048096, "logps/rejected": -64.19776153564453, "loss": 0.1873, "rewards/accuracies": 1.0, "rewards/chosen": 1.4543613195419312, "rewards/margins": 1.4543613195419312, "rewards/rejected": 0.0, "step": 601 }, { "epoch": 3.363128491620112, "grad_norm": 4.166054699411632, "learning_rate": 9.635843155869448e-07, "logits/chosen": -3.5230698585510254, "logits/rejected": -3.564009428024292, "logps/chosen": -26.481098175048828, "logps/rejected": -31.756576538085938, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": 2.0945353507995605, "rewards/margins": 2.0945353507995605, "rewards/rejected": 0.0, "step": 602 }, { "epoch": 3.368715083798883, "grad_norm": 4.527167153730143, "learning_rate": 9.63355608782735e-07, "logits/chosen": -3.5326149463653564, "logits/rejected": -3.4358036518096924, "logps/chosen": -25.61269760131836, "logps/rejected": -17.74207305908203, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 2.192561149597168, "rewards/margins": 2.192561149597168, "rewards/rejected": 0.0, "step": 603 }, { "epoch": 3.3743016759776534, "grad_norm": 4.987759145273387, "learning_rate": 9.631262133495714e-07, "logits/chosen": -3.794609308242798, "logits/rejected": -3.7175488471984863, "logps/chosen": -8.738481521606445, "logps/rejected": -51.309539794921875, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 1.7903088331222534, "rewards/margins": 1.7903088331222534, "rewards/rejected": 0.0, "step": 604 }, { "epoch": 3.3798882681564244, "grad_norm": 5.759532896560646, "learning_rate": 9.628961296283764e-07, "logits/chosen": -3.6752233505249023, "logits/rejected": -3.772005558013916, "logps/chosen": -5.829462051391602, "logps/rejected": -27.75981903076172, "loss": 0.1769, "rewards/accuracies": 1.0, "rewards/chosen": 1.6819231510162354, "rewards/margins": 1.6819231510162354, "rewards/rejected": 0.0, "step": 605 }, { "epoch": 3.3854748603351954, "grad_norm": 4.743147267197351, "learning_rate": 9.626653579610958e-07, "logits/chosen": -3.6155364513397217, "logits/rejected": -3.660079002380371, "logps/chosen": -13.428018569946289, "logps/rejected": -47.90098571777344, "loss": 0.1733, "rewards/accuracies": 1.0, "rewards/chosen": 2.271134614944458, "rewards/margins": 2.271134614944458, "rewards/rejected": 0.0, "step": 606 }, { "epoch": 3.3910614525139664, "grad_norm": 4.898652037325377, "learning_rate": 9.624338986906967e-07, "logits/chosen": -3.5013554096221924, "logits/rejected": -3.5532264709472656, "logps/chosen": -14.624397277832031, "logps/rejected": -30.702373504638672, "loss": 0.1768, "rewards/accuracies": 1.0, "rewards/chosen": 1.7548508644104004, "rewards/margins": 1.7548508644104004, "rewards/rejected": 0.0, "step": 607 }, { "epoch": 3.3966480446927374, "grad_norm": 6.47308077269089, "learning_rate": 9.622017521611697e-07, "logits/chosen": -3.6072585582733154, "logits/rejected": -3.499119997024536, "logps/chosen": -12.317621231079102, "logps/rejected": -28.305444717407227, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 2.382249355316162, "rewards/margins": 2.382249355316162, "rewards/rejected": 0.0, "step": 608 }, { "epoch": 3.4022346368715084, "grad_norm": 4.6331298377841, "learning_rate": 9.619689187175248e-07, "logits/chosen": -3.594680070877075, "logits/rejected": -3.6533143520355225, "logps/chosen": -10.815997123718262, "logps/rejected": -52.948158264160156, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 1.53365159034729, "rewards/margins": 1.53365159034729, "rewards/rejected": 0.0, "step": 609 }, { "epoch": 3.4078212290502794, "grad_norm": 4.438532761181371, "learning_rate": 9.61735398705795e-07, "logits/chosen": -3.5103824138641357, "logits/rejected": -3.302504062652588, "logps/chosen": -31.74606704711914, "logps/rejected": -17.0972957611084, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 2.507812023162842, "rewards/margins": 2.507812023162842, "rewards/rejected": 0.0, "step": 610 }, { "epoch": 3.4134078212290504, "grad_norm": 5.218958127801242, "learning_rate": 9.615011924730318e-07, "logits/chosen": -3.8670122623443604, "logits/rejected": -3.679332733154297, "logps/chosen": -14.15910530090332, "logps/rejected": -21.305110931396484, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": 1.9713423252105713, "rewards/margins": 1.9713423252105713, "rewards/rejected": 0.0, "step": 611 }, { "epoch": 3.4189944134078214, "grad_norm": 4.811857730384431, "learning_rate": 9.612663003673077e-07, "logits/chosen": -3.7115883827209473, "logits/rejected": -3.7226953506469727, "logps/chosen": -9.147367477416992, "logps/rejected": -20.090017318725586, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 1.535884141921997, "rewards/margins": 1.535884141921997, "rewards/rejected": 0.0, "step": 612 }, { "epoch": 3.4245810055865924, "grad_norm": 5.1760160227896135, "learning_rate": 9.610307227377145e-07, "logits/chosen": -4.140496253967285, "logits/rejected": -3.9840750694274902, "logps/chosen": -37.68970489501953, "logps/rejected": -21.389732360839844, "loss": 0.1513, "rewards/accuracies": 1.0, "rewards/chosen": 2.4988536834716797, "rewards/margins": 2.4988536834716797, "rewards/rejected": 0.0, "step": 613 }, { "epoch": 3.430167597765363, "grad_norm": 6.542052114074927, "learning_rate": 9.60794459934362e-07, "logits/chosen": -3.9749577045440674, "logits/rejected": -3.6949622631073, "logps/chosen": -7.262472152709961, "logps/rejected": -27.838661193847656, "loss": 0.1995, "rewards/accuracies": 1.0, "rewards/chosen": 1.7984397411346436, "rewards/margins": 1.7984397411346436, "rewards/rejected": 0.0, "step": 614 }, { "epoch": 3.435754189944134, "grad_norm": 4.723249123539818, "learning_rate": 9.60557512308379e-07, "logits/chosen": -3.4244918823242188, "logits/rejected": -3.4502954483032227, "logps/chosen": -16.041728973388672, "logps/rejected": -34.97552490234375, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 1.9372085332870483, "rewards/margins": 1.9372085332870483, "rewards/rejected": 0.0, "step": 615 }, { "epoch": 3.441340782122905, "grad_norm": 6.321918155936006, "learning_rate": 9.60319880211912e-07, "logits/chosen": -3.6923134326934814, "logits/rejected": -3.7828528881073, "logps/chosen": -8.835647583007812, "logps/rejected": -19.127933502197266, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": 1.688408374786377, "rewards/margins": 1.688408374786377, "rewards/rejected": 0.0, "step": 616 }, { "epoch": 3.446927374301676, "grad_norm": 5.15241985028114, "learning_rate": 9.600815639981248e-07, "logits/chosen": -3.5907249450683594, "logits/rejected": -3.7290914058685303, "logps/chosen": -20.6009578704834, "logps/rejected": -23.31732177734375, "loss": 0.1511, "rewards/accuracies": 1.0, "rewards/chosen": 2.027575969696045, "rewards/margins": 2.027575969696045, "rewards/rejected": 0.0, "step": 617 }, { "epoch": 3.452513966480447, "grad_norm": 6.098796287127406, "learning_rate": 9.598425640211971e-07, "logits/chosen": -3.561833620071411, "logits/rejected": -3.593796730041504, "logps/chosen": -14.328054428100586, "logps/rejected": -27.98879623413086, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 1.8051799535751343, "rewards/margins": 1.8051799535751343, "rewards/rejected": 0.0, "step": 618 }, { "epoch": 3.458100558659218, "grad_norm": 6.881151069002195, "learning_rate": 9.596028806363258e-07, "logits/chosen": -3.9076590538024902, "logits/rejected": -3.656128406524658, "logps/chosen": -24.803630828857422, "logps/rejected": -40.879093170166016, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": 1.7125649452209473, "rewards/margins": 1.7125649452209473, "rewards/rejected": 0.0, "step": 619 }, { "epoch": 3.463687150837989, "grad_norm": 4.453490548371726, "learning_rate": 9.593625141997232e-07, "logits/chosen": -3.6640028953552246, "logits/rejected": -3.509269952774048, "logps/chosen": -9.31806755065918, "logps/rejected": -26.860980987548828, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 1.7607637643814087, "rewards/margins": 1.7607637643814087, "rewards/rejected": 0.0, "step": 620 }, { "epoch": 3.463687150837989, "eval_logits/chosen": -3.4551804065704346, "eval_logits/rejected": -3.559019088745117, "eval_logps/chosen": -19.769975662231445, "eval_logps/rejected": -34.009849548339844, "eval_loss": 0.3931514024734497, "eval_rewards/accuracies": 0.925000011920929, "eval_rewards/chosen": 1.0300068855285645, "eval_rewards/margins": 1.0300068855285645, "eval_rewards/rejected": 0.0, "eval_runtime": 34.8509, "eval_samples_per_second": 8.895, "eval_steps_per_second": 0.287, "step": 620 }, { "epoch": 3.46927374301676, "grad_norm": 4.69081008723961, "learning_rate": 9.591214650686165e-07, "logits/chosen": -3.429396867752075, "logits/rejected": -3.533478260040283, "logps/chosen": -9.281721115112305, "logps/rejected": -41.42967224121094, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": 1.78550124168396, "rewards/margins": 1.78550124168396, "rewards/rejected": 0.0, "step": 621 }, { "epoch": 3.4748603351955305, "grad_norm": 5.138819012890861, "learning_rate": 9.588797336012476e-07, "logits/chosen": -3.8199877738952637, "logits/rejected": -3.800265073776245, "logps/chosen": -8.140040397644043, "logps/rejected": -24.334957122802734, "loss": 0.1587, "rewards/accuracies": 1.0, "rewards/chosen": 1.5762226581573486, "rewards/margins": 1.5762226581573486, "rewards/rejected": 0.0, "step": 622 }, { "epoch": 3.4804469273743015, "grad_norm": 5.094867464072331, "learning_rate": 9.586373201568727e-07, "logits/chosen": -3.436248540878296, "logits/rejected": -3.528372049331665, "logps/chosen": -6.533449172973633, "logps/rejected": -46.51861572265625, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 1.7865111827850342, "rewards/margins": 1.7865111827850342, "rewards/rejected": 0.0, "step": 623 }, { "epoch": 3.4860335195530725, "grad_norm": 7.0326792611564315, "learning_rate": 9.58394225095761e-07, "logits/chosen": -3.6675281524658203, "logits/rejected": -3.461420774459839, "logps/chosen": -12.002155303955078, "logps/rejected": -31.428363800048828, "loss": 0.1891, "rewards/accuracies": 1.0, "rewards/chosen": 1.4570937156677246, "rewards/margins": 1.4570937156677246, "rewards/rejected": 0.0, "step": 624 }, { "epoch": 3.4916201117318435, "grad_norm": 5.30470515311673, "learning_rate": 9.581504487791954e-07, "logits/chosen": -3.8229575157165527, "logits/rejected": -3.6838696002960205, "logps/chosen": -5.357357501983643, "logps/rejected": -30.263629913330078, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 1.4722356796264648, "rewards/margins": 1.4722356796264648, "rewards/rejected": 0.0, "step": 625 }, { "epoch": 3.4972067039106145, "grad_norm": 5.262133806438183, "learning_rate": 9.579059915694706e-07, "logits/chosen": -3.353064775466919, "logits/rejected": -3.4130282402038574, "logps/chosen": -17.963783264160156, "logps/rejected": -30.49808120727539, "loss": 0.2108, "rewards/accuracies": 1.0, "rewards/chosen": 1.7664779424667358, "rewards/margins": 1.7664779424667358, "rewards/rejected": 0.0, "step": 626 }, { "epoch": 3.5027932960893855, "grad_norm": 5.004012675634213, "learning_rate": 9.57660853829894e-07, "logits/chosen": -3.4588680267333984, "logits/rejected": -3.574172258377075, "logps/chosen": -6.091219902038574, "logps/rejected": -29.69709014892578, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 1.4152452945709229, "rewards/margins": 1.4152452945709229, "rewards/rejected": 0.0, "step": 627 }, { "epoch": 3.5083798882681565, "grad_norm": 4.626292696994778, "learning_rate": 9.574150359247835e-07, "logits/chosen": -3.640641689300537, "logits/rejected": -3.775499105453491, "logps/chosen": -15.644683837890625, "logps/rejected": -39.21204376220703, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": 2.2473812103271484, "rewards/margins": 2.2473812103271484, "rewards/rejected": 0.0, "step": 628 }, { "epoch": 3.5139664804469275, "grad_norm": 7.093816370215724, "learning_rate": 9.571685382194685e-07, "logits/chosen": -3.4563698768615723, "logits/rejected": -3.6638269424438477, "logps/chosen": -5.070566654205322, "logps/rejected": -36.20479965209961, "loss": 0.196, "rewards/accuracies": 1.0, "rewards/chosen": 1.20818293094635, "rewards/margins": 1.20818293094635, "rewards/rejected": 0.0, "step": 629 }, { "epoch": 3.5195530726256985, "grad_norm": 5.626556316151276, "learning_rate": 9.569213610802883e-07, "logits/chosen": -3.2967770099639893, "logits/rejected": -3.477936029434204, "logps/chosen": -20.65081024169922, "logps/rejected": -46.64710235595703, "loss": 0.1941, "rewards/accuracies": 1.0, "rewards/chosen": 1.5222539901733398, "rewards/margins": 1.5222539901733398, "rewards/rejected": 0.0, "step": 630 }, { "epoch": 3.5251396648044695, "grad_norm": 5.238010505375191, "learning_rate": 9.566735048745925e-07, "logits/chosen": -3.8448498249053955, "logits/rejected": -3.834787130355835, "logps/chosen": -12.810737609863281, "logps/rejected": -23.271757125854492, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": 1.8085863590240479, "rewards/margins": 1.8085863590240479, "rewards/rejected": 0.0, "step": 631 }, { "epoch": 3.5307262569832405, "grad_norm": 5.361118050321566, "learning_rate": 9.564249699707393e-07, "logits/chosen": -3.449556350708008, "logits/rejected": -3.5498440265655518, "logps/chosen": -14.951969146728516, "logps/rejected": -39.278709411621094, "loss": 0.1504, "rewards/accuracies": 1.0, "rewards/chosen": 2.020573616027832, "rewards/margins": 2.020573616027832, "rewards/rejected": 0.0, "step": 632 }, { "epoch": 3.536312849162011, "grad_norm": 5.868396347645879, "learning_rate": 9.561757567380958e-07, "logits/chosen": -3.4344379901885986, "logits/rejected": -3.4854111671447754, "logps/chosen": -10.592303276062012, "logps/rejected": -27.1976375579834, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 1.5092116594314575, "rewards/margins": 1.5092116594314575, "rewards/rejected": 0.0, "step": 633 }, { "epoch": 3.541899441340782, "grad_norm": 5.212421228813011, "learning_rate": 9.559258655470375e-07, "logits/chosen": -3.630856513977051, "logits/rejected": -3.720335006713867, "logps/chosen": -11.691306114196777, "logps/rejected": -57.59962463378906, "loss": 0.1744, "rewards/accuracies": 1.0, "rewards/chosen": 2.189746379852295, "rewards/margins": 2.189746379852295, "rewards/rejected": 0.0, "step": 634 }, { "epoch": 3.547486033519553, "grad_norm": 5.150740438504914, "learning_rate": 9.556752967689469e-07, "logits/chosen": -3.617985725402832, "logits/rejected": -3.5494420528411865, "logps/chosen": -11.184094429016113, "logps/rejected": -33.076351165771484, "loss": 0.1861, "rewards/accuracies": 1.0, "rewards/chosen": 2.091069221496582, "rewards/margins": 2.091069221496582, "rewards/rejected": 0.0, "step": 635 }, { "epoch": 3.553072625698324, "grad_norm": 5.440410797672074, "learning_rate": 9.55424050776214e-07, "logits/chosen": -3.30224347114563, "logits/rejected": -3.57208514213562, "logps/chosen": -10.038273811340332, "logps/rejected": -29.3889217376709, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 1.6328983306884766, "rewards/margins": 1.6328983306884766, "rewards/rejected": 0.0, "step": 636 }, { "epoch": 3.558659217877095, "grad_norm": 4.65723960855201, "learning_rate": 9.55172127942235e-07, "logits/chosen": -3.5845320224761963, "logits/rejected": -3.3195221424102783, "logps/chosen": -12.603439331054688, "logps/rejected": -73.64794921875, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 1.90388822555542, "rewards/margins": 1.90388822555542, "rewards/rejected": 0.0, "step": 637 }, { "epoch": 3.564245810055866, "grad_norm": 6.528446821475113, "learning_rate": 9.549195286414122e-07, "logits/chosen": -3.185835123062134, "logits/rejected": -3.4180290699005127, "logps/chosen": -31.241548538208008, "logps/rejected": -42.23626708984375, "loss": 0.1527, "rewards/accuracies": 1.0, "rewards/chosen": 2.1054329872131348, "rewards/margins": 2.1054329872131348, "rewards/rejected": 0.0, "step": 638 }, { "epoch": 3.5698324022346366, "grad_norm": 4.677698484388385, "learning_rate": 9.54666253249153e-07, "logits/chosen": -3.7723662853240967, "logits/rejected": -3.773625373840332, "logps/chosen": -3.9417715072631836, "logps/rejected": -16.878204345703125, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 1.1821473836898804, "rewards/margins": 1.1821473836898804, "rewards/rejected": 0.0, "step": 639 }, { "epoch": 3.5754189944134076, "grad_norm": 5.117589561037088, "learning_rate": 9.5441230214187e-07, "logits/chosen": -3.399118661880493, "logits/rejected": -3.243917942047119, "logps/chosen": -18.99847412109375, "logps/rejected": -34.145355224609375, "loss": 0.17, "rewards/accuracies": 1.0, "rewards/chosen": 1.8375072479248047, "rewards/margins": 1.8375072479248047, "rewards/rejected": 0.0, "step": 640 }, { "epoch": 3.5754189944134076, "eval_logits/chosen": -3.4527626037597656, "eval_logits/rejected": -3.556689739227295, "eval_logps/chosen": -19.967891693115234, "eval_logps/rejected": -34.24546432495117, "eval_loss": 0.40128594636917114, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.0102150440216064, "eval_rewards/margins": 1.0102150440216064, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7062, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 640 }, { "epoch": 3.5810055865921786, "grad_norm": 5.133411055097609, "learning_rate": 9.541576756969796e-07, "logits/chosen": -3.5344247817993164, "logits/rejected": -3.62461256980896, "logps/chosen": -23.3912296295166, "logps/rejected": -42.15888595581055, "loss": 0.1522, "rewards/accuracies": 1.0, "rewards/chosen": 2.3202648162841797, "rewards/margins": 2.3202648162841797, "rewards/rejected": 0.0, "step": 641 }, { "epoch": 3.5865921787709496, "grad_norm": 4.570418247843243, "learning_rate": 9.539023742929021e-07, "logits/chosen": -3.8138322830200195, "logits/rejected": -3.665846586227417, "logps/chosen": -9.992279052734375, "logps/rejected": -33.50912857055664, "loss": 0.1555, "rewards/accuracies": 1.0, "rewards/chosen": 2.0003204345703125, "rewards/margins": 2.0003204345703125, "rewards/rejected": 0.0, "step": 642 }, { "epoch": 3.5921787709497206, "grad_norm": 7.227085737302612, "learning_rate": 9.536463983090606e-07, "logits/chosen": -3.5766735076904297, "logits/rejected": -3.831467390060425, "logps/chosen": -8.467517852783203, "logps/rejected": -32.18593978881836, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 1.8903911113739014, "rewards/margins": 1.8903911113739014, "rewards/rejected": 0.0, "step": 643 }, { "epoch": 3.5977653631284916, "grad_norm": 5.682147119312251, "learning_rate": 9.533897481258815e-07, "logits/chosen": -3.520840883255005, "logits/rejected": -3.321063280105591, "logps/chosen": -13.972931861877441, "logps/rejected": -66.25763702392578, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 1.5537865161895752, "rewards/margins": 1.5537865161895752, "rewards/rejected": 0.0, "step": 644 }, { "epoch": 3.6033519553072626, "grad_norm": 5.696592487103718, "learning_rate": 9.531324241247922e-07, "logits/chosen": -3.572374105453491, "logits/rejected": -3.5924270153045654, "logps/chosen": -24.20823097229004, "logps/rejected": -24.128681182861328, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 2.120861053466797, "rewards/margins": 2.120861053466797, "rewards/rejected": 0.0, "step": 645 }, { "epoch": 3.6089385474860336, "grad_norm": 6.012466721685493, "learning_rate": 9.528744266882223e-07, "logits/chosen": -3.735198497772217, "logits/rejected": -3.734330892562866, "logps/chosen": -26.19061851501465, "logps/rejected": -27.667203903198242, "loss": 0.1793, "rewards/accuracies": 1.0, "rewards/chosen": 2.7986745834350586, "rewards/margins": 2.7986745834350586, "rewards/rejected": 0.0, "step": 646 }, { "epoch": 3.6145251396648046, "grad_norm": 4.644339200519325, "learning_rate": 9.526157561996017e-07, "logits/chosen": -3.922447443008423, "logits/rejected": -3.8625240325927734, "logps/chosen": -17.55810546875, "logps/rejected": -28.501117706298828, "loss": 0.1665, "rewards/accuracies": 1.0, "rewards/chosen": 2.302061080932617, "rewards/margins": 2.302061080932617, "rewards/rejected": 0.0, "step": 647 }, { "epoch": 3.6201117318435756, "grad_norm": 4.455673398979726, "learning_rate": 9.523564130433609e-07, "logits/chosen": -3.790313959121704, "logits/rejected": -3.7689945697784424, "logps/chosen": -4.420231819152832, "logps/rejected": -21.001861572265625, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": 1.5885013341903687, "rewards/margins": 1.5885013341903687, "rewards/rejected": 0.0, "step": 648 }, { "epoch": 3.6256983240223466, "grad_norm": 5.067755669359323, "learning_rate": 9.520963976049301e-07, "logits/chosen": -3.3586041927337646, "logits/rejected": -3.1449713706970215, "logps/chosen": -25.404495239257812, "logps/rejected": -51.243751525878906, "loss": 0.1795, "rewards/accuracies": 1.0, "rewards/chosen": 1.6585184335708618, "rewards/margins": 1.6585184335708618, "rewards/rejected": 0.0, "step": 649 }, { "epoch": 3.631284916201117, "grad_norm": 5.296324090772392, "learning_rate": 9.518357102707386e-07, "logits/chosen": -3.651949882507324, "logits/rejected": -3.735245943069458, "logps/chosen": -4.745665550231934, "logps/rejected": -27.808101654052734, "loss": 0.1833, "rewards/accuracies": 1.0, "rewards/chosen": 1.3110535144805908, "rewards/margins": 1.3110535144805908, "rewards/rejected": 0.0, "step": 650 }, { "epoch": 3.636871508379888, "grad_norm": 4.365839520605308, "learning_rate": 9.51574351428214e-07, "logits/chosen": -3.827695608139038, "logits/rejected": -3.716557502746582, "logps/chosen": -11.460929870605469, "logps/rejected": -32.29026794433594, "loss": 0.2001, "rewards/accuracies": 1.0, "rewards/chosen": 1.666351079940796, "rewards/margins": 1.666351079940796, "rewards/rejected": 0.0, "step": 651 }, { "epoch": 3.642458100558659, "grad_norm": 4.479324786427868, "learning_rate": 9.513123214657821e-07, "logits/chosen": -3.68721866607666, "logits/rejected": -3.325680732727051, "logps/chosen": -9.46074390411377, "logps/rejected": -41.34431457519531, "loss": 0.1789, "rewards/accuracies": 1.0, "rewards/chosen": 1.501688838005066, "rewards/margins": 1.501688838005066, "rewards/rejected": 0.0, "step": 652 }, { "epoch": 3.64804469273743, "grad_norm": 5.149847701080681, "learning_rate": 9.510496207728662e-07, "logits/chosen": -3.6695034503936768, "logits/rejected": -3.5877413749694824, "logps/chosen": -15.89958381652832, "logps/rejected": -28.580312728881836, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 2.0160269737243652, "rewards/margins": 2.0160269737243652, "rewards/rejected": 0.0, "step": 653 }, { "epoch": 3.653631284916201, "grad_norm": 5.109958931687544, "learning_rate": 9.507862497398864e-07, "logits/chosen": -4.000102996826172, "logits/rejected": -3.9663286209106445, "logps/chosen": -7.500527381896973, "logps/rejected": -24.804828643798828, "loss": 0.1609, "rewards/accuracies": 1.0, "rewards/chosen": 1.7702608108520508, "rewards/margins": 1.7702608108520508, "rewards/rejected": 0.0, "step": 654 }, { "epoch": 3.659217877094972, "grad_norm": 4.656953780482441, "learning_rate": 9.505222087582588e-07, "logits/chosen": -3.6987736225128174, "logits/rejected": -3.8206393718719482, "logps/chosen": -10.570422172546387, "logps/rejected": -27.170337677001953, "loss": 0.1954, "rewards/accuracies": 1.0, "rewards/chosen": 1.9592132568359375, "rewards/margins": 1.9592132568359375, "rewards/rejected": 0.0, "step": 655 }, { "epoch": 3.664804469273743, "grad_norm": 4.713062170943476, "learning_rate": 9.502574982203954e-07, "logits/chosen": -3.754716157913208, "logits/rejected": -3.68346905708313, "logps/chosen": -13.092695236206055, "logps/rejected": -32.32396697998047, "loss": 0.1857, "rewards/accuracies": 1.0, "rewards/chosen": 2.1699705123901367, "rewards/margins": 2.1699705123901367, "rewards/rejected": 0.0, "step": 656 }, { "epoch": 3.6703910614525137, "grad_norm": 4.215207701061461, "learning_rate": 9.499921185197032e-07, "logits/chosen": -3.3362271785736084, "logits/rejected": -3.278992176055908, "logps/chosen": -4.92309045791626, "logps/rejected": -34.32732009887695, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": 1.4023206233978271, "rewards/margins": 1.4023206233978271, "rewards/rejected": 0.0, "step": 657 }, { "epoch": 3.6759776536312847, "grad_norm": 5.333585474707274, "learning_rate": 9.497260700505835e-07, "logits/chosen": -3.3382411003112793, "logits/rejected": -3.0705416202545166, "logps/chosen": -14.22011947631836, "logps/rejected": -36.958946228027344, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 1.247107982635498, "rewards/margins": 1.247107982635498, "rewards/rejected": 0.0, "step": 658 }, { "epoch": 3.6815642458100557, "grad_norm": 5.676048291789358, "learning_rate": 9.494593532084321e-07, "logits/chosen": -3.778850793838501, "logits/rejected": -3.78900408744812, "logps/chosen": -7.684193134307861, "logps/rejected": -40.97362518310547, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 1.6917026042938232, "rewards/margins": 1.6917026042938232, "rewards/rejected": 0.0, "step": 659 }, { "epoch": 3.6871508379888267, "grad_norm": 4.901932214135667, "learning_rate": 9.491919683896373e-07, "logits/chosen": -3.4505538940429688, "logits/rejected": -3.496281147003174, "logps/chosen": -4.362544059753418, "logps/rejected": -25.02591323852539, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 1.1565850973129272, "rewards/margins": 1.1565850973129272, "rewards/rejected": 0.0, "step": 660 }, { "epoch": 3.6871508379888267, "eval_logits/chosen": -3.4496493339538574, "eval_logits/rejected": -3.5553760528564453, "eval_logps/chosen": -20.094425201416016, "eval_logps/rejected": -34.56037139892578, "eval_loss": 0.39944276213645935, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9975619316101074, "eval_rewards/margins": 0.9975619316101074, "eval_rewards/rejected": 0.0, "eval_runtime": 32.736, "eval_samples_per_second": 9.47, "eval_steps_per_second": 0.305, "step": 660 }, { "epoch": 3.6927374301675977, "grad_norm": 7.205953665972467, "learning_rate": 9.48923915991581e-07, "logits/chosen": -3.631009101867676, "logits/rejected": -3.6369123458862305, "logps/chosen": -34.10837936401367, "logps/rejected": -23.194347381591797, "loss": 0.1604, "rewards/accuracies": 1.0, "rewards/chosen": 1.9607927799224854, "rewards/margins": 1.9607927799224854, "rewards/rejected": 0.0, "step": 661 }, { "epoch": 3.6983240223463687, "grad_norm": 5.019279966582101, "learning_rate": 9.486551964126362e-07, "logits/chosen": -3.6323862075805664, "logits/rejected": -3.6116278171539307, "logps/chosen": -5.30668830871582, "logps/rejected": -29.413761138916016, "loss": 0.1511, "rewards/accuracies": 1.0, "rewards/chosen": 1.2488832473754883, "rewards/margins": 1.2488832473754883, "rewards/rejected": 0.0, "step": 662 }, { "epoch": 3.7039106145251397, "grad_norm": 4.732609891488001, "learning_rate": 9.483858100521686e-07, "logits/chosen": -3.6550815105438232, "logits/rejected": -3.189901113510132, "logps/chosen": -8.990190505981445, "logps/rejected": -46.08880615234375, "loss": 0.1565, "rewards/accuracies": 1.0, "rewards/chosen": 1.7714471817016602, "rewards/margins": 1.7714471817016602, "rewards/rejected": 0.0, "step": 663 }, { "epoch": 3.7094972067039107, "grad_norm": 5.120186565591483, "learning_rate": 9.481157573105343e-07, "logits/chosen": -3.8208818435668945, "logits/rejected": -3.5488696098327637, "logps/chosen": -6.346719741821289, "logps/rejected": -25.36515998840332, "loss": 0.1615, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430108547210693, "rewards/margins": 1.6430108547210693, "rewards/rejected": 0.0, "step": 664 }, { "epoch": 3.7150837988826817, "grad_norm": 4.466537483585655, "learning_rate": 9.478450385890793e-07, "logits/chosen": -3.73468279838562, "logits/rejected": -3.436859369277954, "logps/chosen": -14.974721908569336, "logps/rejected": -31.110218048095703, "loss": 0.1636, "rewards/accuracies": 1.0, "rewards/chosen": 2.5450804233551025, "rewards/margins": 2.5450804233551025, "rewards/rejected": 0.0, "step": 665 }, { "epoch": 3.7206703910614527, "grad_norm": 5.622882110084028, "learning_rate": 9.475736542901402e-07, "logits/chosen": -3.625474691390991, "logits/rejected": -3.738278865814209, "logps/chosen": -10.976929664611816, "logps/rejected": -31.13936996459961, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": 1.916973352432251, "rewards/margins": 1.916973352432251, "rewards/rejected": 0.0, "step": 666 }, { "epoch": 3.7262569832402237, "grad_norm": 6.155213830518234, "learning_rate": 9.473016048170424e-07, "logits/chosen": -3.7011091709136963, "logits/rejected": -3.6307482719421387, "logps/chosen": -9.683084487915039, "logps/rejected": -50.2158088684082, "loss": 0.2093, "rewards/accuracies": 1.0, "rewards/chosen": 1.3847639560699463, "rewards/margins": 1.3847639560699463, "rewards/rejected": 0.0, "step": 667 }, { "epoch": 3.7318435754189943, "grad_norm": 5.543091406250431, "learning_rate": 9.470288905740997e-07, "logits/chosen": -3.4428887367248535, "logits/rejected": -3.432431936264038, "logps/chosen": -23.892230987548828, "logps/rejected": -20.082015991210938, "loss": 0.1712, "rewards/accuracies": 1.0, "rewards/chosen": 2.1780343055725098, "rewards/margins": 2.1780343055725098, "rewards/rejected": 0.0, "step": 668 }, { "epoch": 3.7374301675977653, "grad_norm": 8.02166683785944, "learning_rate": 9.467555119666142e-07, "logits/chosen": -3.8772687911987305, "logits/rejected": -3.8502283096313477, "logps/chosen": -5.98099422454834, "logps/rejected": -44.68482208251953, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 1.5072976350784302, "rewards/margins": 1.5072976350784302, "rewards/rejected": 0.0, "step": 669 }, { "epoch": 3.7430167597765363, "grad_norm": 6.802613644354898, "learning_rate": 9.464814694008751e-07, "logits/chosen": -3.6652045249938965, "logits/rejected": -3.6151578426361084, "logps/chosen": -13.424704551696777, "logps/rejected": -20.253692626953125, "loss": 0.2114, "rewards/accuracies": 1.0, "rewards/chosen": 2.2190957069396973, "rewards/margins": 2.2190957069396973, "rewards/rejected": 0.0, "step": 670 }, { "epoch": 3.7486033519553073, "grad_norm": 4.92290955136957, "learning_rate": 9.462067632841584e-07, "logits/chosen": -3.666156053543091, "logits/rejected": -3.7569448947906494, "logps/chosen": -7.920692443847656, "logps/rejected": -29.915081024169922, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": 1.3987126350402832, "rewards/margins": 1.3987126350402832, "rewards/rejected": 0.0, "step": 671 }, { "epoch": 3.7541899441340782, "grad_norm": 6.164093722805633, "learning_rate": 9.459313940247265e-07, "logits/chosen": -3.394481897354126, "logits/rejected": -3.5197486877441406, "logps/chosen": -29.63011360168457, "logps/rejected": -15.040891647338867, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 1.591725468635559, "rewards/margins": 1.591725468635559, "rewards/rejected": 0.0, "step": 672 }, { "epoch": 3.7597765363128492, "grad_norm": 4.5768595129802225, "learning_rate": 9.456553620318268e-07, "logits/chosen": -3.614335298538208, "logits/rejected": -3.7072131633758545, "logps/chosen": -7.686119079589844, "logps/rejected": -48.3739128112793, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": 1.6674258708953857, "rewards/margins": 1.6674258708953857, "rewards/rejected": 0.0, "step": 673 }, { "epoch": 3.7653631284916202, "grad_norm": 5.756451394662651, "learning_rate": 9.453786677156925e-07, "logits/chosen": -3.025435209274292, "logits/rejected": -2.9831600189208984, "logps/chosen": -32.42550277709961, "logps/rejected": -43.135040283203125, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": 1.9021589756011963, "rewards/margins": 1.9021589756011963, "rewards/rejected": 0.0, "step": 674 }, { "epoch": 3.770949720670391, "grad_norm": 5.462064085578757, "learning_rate": 9.451013114875403e-07, "logits/chosen": -3.7452549934387207, "logits/rejected": -3.701018810272217, "logps/chosen": -23.871150970458984, "logps/rejected": -49.515480041503906, "loss": 0.1527, "rewards/accuracies": 1.0, "rewards/chosen": 2.078068733215332, "rewards/margins": 2.078068733215332, "rewards/rejected": 0.0, "step": 675 }, { "epoch": 3.776536312849162, "grad_norm": 5.547590022778659, "learning_rate": 9.448232937595709e-07, "logits/chosen": -3.4784772396087646, "logits/rejected": -3.3884263038635254, "logps/chosen": -25.33246612548828, "logps/rejected": -26.745288848876953, "loss": 0.1654, "rewards/accuracies": 1.0, "rewards/chosen": 2.1457791328430176, "rewards/margins": 2.1457791328430176, "rewards/rejected": 0.0, "step": 676 }, { "epoch": 3.782122905027933, "grad_norm": 6.291665828229705, "learning_rate": 9.445446149449686e-07, "logits/chosen": -3.669466018676758, "logits/rejected": -3.5027315616607666, "logps/chosen": -8.840606689453125, "logps/rejected": -26.029987335205078, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 1.6782493591308594, "rewards/margins": 1.6782493591308594, "rewards/rejected": 0.0, "step": 677 }, { "epoch": 3.787709497206704, "grad_norm": 5.7516916496717645, "learning_rate": 9.442652754578991e-07, "logits/chosen": -3.5253586769104004, "logits/rejected": -3.5481929779052734, "logps/chosen": -38.82970428466797, "logps/rejected": -32.559940338134766, "loss": 0.1614, "rewards/accuracies": 1.0, "rewards/chosen": 1.7083141803741455, "rewards/margins": 1.7083141803741455, "rewards/rejected": 0.0, "step": 678 }, { "epoch": 3.793296089385475, "grad_norm": 5.676763386544318, "learning_rate": 9.439852757135109e-07, "logits/chosen": -3.507185935974121, "logits/rejected": -3.475709915161133, "logps/chosen": -24.707077026367188, "logps/rejected": -32.674659729003906, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 2.5474183559417725, "rewards/margins": 2.5474183559417725, "rewards/rejected": 0.0, "step": 679 }, { "epoch": 3.798882681564246, "grad_norm": 6.652992280245279, "learning_rate": 9.437046161279338e-07, "logits/chosen": -3.5408294200897217, "logits/rejected": -3.46547794342041, "logps/chosen": -4.129613876342773, "logps/rejected": -43.75081253051758, "loss": 0.2188, "rewards/accuracies": 1.0, "rewards/chosen": 1.249681830406189, "rewards/margins": 1.249681830406189, "rewards/rejected": 0.0, "step": 680 }, { "epoch": 3.798882681564246, "eval_logits/chosen": -3.46787691116333, "eval_logits/rejected": -3.571978807449341, "eval_logps/chosen": -19.941822052001953, "eval_logps/rejected": -34.716636657714844, "eval_loss": 0.3982676863670349, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.012821912765503, "eval_rewards/margins": 1.012821912765503, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6974, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 680 }, { "epoch": 3.804469273743017, "grad_norm": 6.225168750349631, "learning_rate": 9.434232971182776e-07, "logits/chosen": -2.9479496479034424, "logits/rejected": -2.898322582244873, "logps/chosen": -31.89510154724121, "logps/rejected": -45.454803466796875, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": 1.8217171430587769, "rewards/margins": 1.8217171430587769, "rewards/rejected": 0.0, "step": 681 }, { "epoch": 3.810055865921788, "grad_norm": 8.990530591590625, "learning_rate": 9.431413191026324e-07, "logits/chosen": -3.5850820541381836, "logits/rejected": -3.771740674972534, "logps/chosen": -10.17590045928955, "logps/rejected": -54.2763671875, "loss": 0.1855, "rewards/accuracies": 1.0, "rewards/chosen": 1.7644659280776978, "rewards/margins": 1.7644659280776978, "rewards/rejected": 0.0, "step": 682 }, { "epoch": 3.815642458100559, "grad_norm": 5.372783330517193, "learning_rate": 9.42858682500068e-07, "logits/chosen": -3.7883987426757812, "logits/rejected": -3.7369813919067383, "logps/chosen": -10.788247108459473, "logps/rejected": -33.16901397705078, "loss": 0.1667, "rewards/accuracies": 1.0, "rewards/chosen": 1.7213985919952393, "rewards/margins": 1.7213985919952393, "rewards/rejected": 0.0, "step": 683 }, { "epoch": 3.82122905027933, "grad_norm": 5.640058295563442, "learning_rate": 9.425753877306325e-07, "logits/chosen": -3.765312671661377, "logits/rejected": -3.813652753829956, "logps/chosen": -21.540424346923828, "logps/rejected": -34.084686279296875, "loss": 0.1856, "rewards/accuracies": 1.0, "rewards/chosen": 1.9886975288391113, "rewards/margins": 1.9886975288391113, "rewards/rejected": 0.0, "step": 684 }, { "epoch": 3.826815642458101, "grad_norm": 4.9159808943093335, "learning_rate": 9.422914352153523e-07, "logits/chosen": -3.9867115020751953, "logits/rejected": -3.6477715969085693, "logps/chosen": -8.451070785522461, "logps/rejected": -68.41416931152344, "loss": 0.1519, "rewards/accuracies": 1.0, "rewards/chosen": 1.7793676853179932, "rewards/margins": 1.7793676853179932, "rewards/rejected": 0.0, "step": 685 }, { "epoch": 3.8324022346368714, "grad_norm": 5.463507510300138, "learning_rate": 9.420068253762318e-07, "logits/chosen": -3.3537416458129883, "logits/rejected": -3.5590171813964844, "logps/chosen": -43.074703216552734, "logps/rejected": -40.1202507019043, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 2.508090019226074, "rewards/margins": 2.508090019226074, "rewards/rejected": 0.0, "step": 686 }, { "epoch": 3.8379888268156424, "grad_norm": 5.973513320758181, "learning_rate": 9.417215586362517e-07, "logits/chosen": -3.8050286769866943, "logits/rejected": -3.7647716999053955, "logps/chosen": -13.847637176513672, "logps/rejected": -22.573652267456055, "loss": 0.2194, "rewards/accuracies": 1.0, "rewards/chosen": 1.968073844909668, "rewards/margins": 1.968073844909668, "rewards/rejected": 0.0, "step": 687 }, { "epoch": 3.8435754189944134, "grad_norm": 5.376646776434792, "learning_rate": 9.414356354193691e-07, "logits/chosen": -3.6765923500061035, "logits/rejected": -3.552823305130005, "logps/chosen": -6.152892112731934, "logps/rejected": -32.36994171142578, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": 1.7726759910583496, "rewards/margins": 1.7726759910583496, "rewards/rejected": 0.0, "step": 688 }, { "epoch": 3.8491620111731844, "grad_norm": 7.975341527703978, "learning_rate": 9.41149056150517e-07, "logits/chosen": -3.5974161624908447, "logits/rejected": -3.5041980743408203, "logps/chosen": -5.884551048278809, "logps/rejected": -22.4666748046875, "loss": 0.2047, "rewards/accuracies": 1.0, "rewards/chosen": 1.4673993587493896, "rewards/margins": 1.4673993587493896, "rewards/rejected": 0.0, "step": 689 }, { "epoch": 3.8547486033519553, "grad_norm": 5.3843626924306145, "learning_rate": 9.40861821255603e-07, "logits/chosen": -3.120640277862549, "logits/rejected": -3.1205742359161377, "logps/chosen": -16.056499481201172, "logps/rejected": -31.95566177368164, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 2.2459497451782227, "rewards/margins": 2.2459497451782227, "rewards/rejected": 0.0, "step": 690 }, { "epoch": 3.8603351955307263, "grad_norm": 5.970205138837227, "learning_rate": 9.405739311615093e-07, "logits/chosen": -3.544402837753296, "logits/rejected": -3.556819438934326, "logps/chosen": -8.344869613647461, "logps/rejected": -27.311933517456055, "loss": 0.1976, "rewards/accuracies": 1.0, "rewards/chosen": 1.778423547744751, "rewards/margins": 1.778423547744751, "rewards/rejected": 0.0, "step": 691 }, { "epoch": 3.8659217877094973, "grad_norm": 5.514562150326068, "learning_rate": 9.402853862960922e-07, "logits/chosen": -3.2727842330932617, "logits/rejected": -3.3912999629974365, "logps/chosen": -16.866605758666992, "logps/rejected": -35.99789810180664, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 2.2165558338165283, "rewards/margins": 2.2165558338165283, "rewards/rejected": 0.0, "step": 692 }, { "epoch": 3.871508379888268, "grad_norm": 5.08182089806298, "learning_rate": 9.399961870881803e-07, "logits/chosen": -3.807812213897705, "logits/rejected": -3.3577778339385986, "logps/chosen": -11.646402359008789, "logps/rejected": -44.892578125, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 2.149644136428833, "rewards/margins": 2.149644136428833, "rewards/rejected": 0.0, "step": 693 }, { "epoch": 3.877094972067039, "grad_norm": 5.277433404269871, "learning_rate": 9.397063339675755e-07, "logits/chosen": -3.4058542251586914, "logits/rejected": -3.51452898979187, "logps/chosen": -16.585853576660156, "logps/rejected": -25.68677520751953, "loss": 0.1634, "rewards/accuracies": 1.0, "rewards/chosen": 2.123507261276245, "rewards/margins": 2.123507261276245, "rewards/rejected": 0.0, "step": 694 }, { "epoch": 3.88268156424581, "grad_norm": 5.577486384805503, "learning_rate": 9.39415827365051e-07, "logits/chosen": -3.2718281745910645, "logits/rejected": -3.2168197631835938, "logps/chosen": -39.55937957763672, "logps/rejected": -34.023231506347656, "loss": 0.18, "rewards/accuracies": 1.0, "rewards/chosen": 1.823746681213379, "rewards/margins": 1.823746681213379, "rewards/rejected": 0.0, "step": 695 }, { "epoch": 3.888268156424581, "grad_norm": 6.045947313979074, "learning_rate": 9.391246677123514e-07, "logits/chosen": -3.422668695449829, "logits/rejected": -3.5968868732452393, "logps/chosen": -20.796070098876953, "logps/rejected": -29.368831634521484, "loss": 0.191, "rewards/accuracies": 1.0, "rewards/chosen": 1.6497948169708252, "rewards/margins": 1.6497948169708252, "rewards/rejected": 0.0, "step": 696 }, { "epoch": 3.893854748603352, "grad_norm": 7.031171292255311, "learning_rate": 9.388328554421916e-07, "logits/chosen": -3.605628728866577, "logits/rejected": -3.9246270656585693, "logps/chosen": -7.370364189147949, "logps/rejected": -28.591331481933594, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 1.4823625087738037, "rewards/margins": 1.4823625087738037, "rewards/rejected": 0.0, "step": 697 }, { "epoch": 3.899441340782123, "grad_norm": 5.939034876154746, "learning_rate": 9.385403909882568e-07, "logits/chosen": -3.5976762771606445, "logits/rejected": -3.3776772022247314, "logps/chosen": -47.60350036621094, "logps/rejected": -34.28583526611328, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": 2.683462619781494, "rewards/margins": 2.683462619781494, "rewards/rejected": 0.0, "step": 698 }, { "epoch": 3.905027932960894, "grad_norm": 5.466067710144353, "learning_rate": 9.382472747852013e-07, "logits/chosen": -3.800515651702881, "logits/rejected": -3.8023061752319336, "logps/chosen": -8.329952239990234, "logps/rejected": -18.521623611450195, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 1.788851261138916, "rewards/margins": 1.788851261138916, "rewards/rejected": 0.0, "step": 699 }, { "epoch": 3.910614525139665, "grad_norm": 5.108787830059381, "learning_rate": 9.379535072686479e-07, "logits/chosen": -4.01161003112793, "logits/rejected": -3.925262689590454, "logps/chosen": -13.309797286987305, "logps/rejected": -23.609291076660156, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": 2.0578761100769043, "rewards/margins": 2.0578761100769043, "rewards/rejected": 0.0, "step": 700 }, { "epoch": 3.910614525139665, "eval_logits/chosen": -3.4569759368896484, "eval_logits/rejected": -3.5611720085144043, "eval_logps/chosen": -19.63014793395996, "eval_logps/rejected": -34.3280029296875, "eval_loss": 0.3954744040966034, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.0439895391464233, "eval_rewards/margins": 1.0439895391464233, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7175, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 700 }, { "epoch": 3.916201117318436, "grad_norm": 6.2322334272570235, "learning_rate": 9.376590888751874e-07, "logits/chosen": -3.695904016494751, "logits/rejected": -3.675644636154175, "logps/chosen": -6.758902549743652, "logps/rejected": -61.0106086730957, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/chosen": 1.6046574115753174, "rewards/margins": 1.6046574115753174, "rewards/rejected": 0.0, "step": 701 }, { "epoch": 3.921787709497207, "grad_norm": 4.9212756999100735, "learning_rate": 9.373640200423781e-07, "logits/chosen": -3.793055295944214, "logits/rejected": -3.6425044536590576, "logps/chosen": -13.498632431030273, "logps/rejected": -17.117267608642578, "loss": 0.1791, "rewards/accuracies": 1.0, "rewards/chosen": 2.3323898315429688, "rewards/margins": 2.3323898315429688, "rewards/rejected": 0.0, "step": 702 }, { "epoch": 3.927374301675978, "grad_norm": 8.869097302222126, "learning_rate": 9.370683012087446e-07, "logits/chosen": -3.450406551361084, "logits/rejected": -3.642115831375122, "logps/chosen": -34.950592041015625, "logps/rejected": -32.08348846435547, "loss": 0.1862, "rewards/accuracies": 1.0, "rewards/chosen": 2.3248953819274902, "rewards/margins": 2.3248953819274902, "rewards/rejected": 0.0, "step": 703 }, { "epoch": 3.9329608938547485, "grad_norm": 5.4792668465797965, "learning_rate": 9.36771932813778e-07, "logits/chosen": -3.65859317779541, "logits/rejected": -3.6708462238311768, "logps/chosen": -9.614830017089844, "logps/rejected": -31.10177993774414, "loss": 0.1733, "rewards/accuracies": 1.0, "rewards/chosen": 1.6424806118011475, "rewards/margins": 1.6424806118011475, "rewards/rejected": 0.0, "step": 704 }, { "epoch": 3.9385474860335195, "grad_norm": 5.624524800100163, "learning_rate": 9.364749152979343e-07, "logits/chosen": -3.3821287155151367, "logits/rejected": -3.666095733642578, "logps/chosen": -19.01564598083496, "logps/rejected": -27.14424705505371, "loss": 0.1645, "rewards/accuracies": 1.0, "rewards/chosen": 2.2867445945739746, "rewards/margins": 2.2867445945739746, "rewards/rejected": 0.0, "step": 705 }, { "epoch": 3.9441340782122905, "grad_norm": 5.402923801872257, "learning_rate": 9.361772491026346e-07, "logits/chosen": -3.9651875495910645, "logits/rejected": -4.0007171630859375, "logps/chosen": -9.126187324523926, "logps/rejected": -20.278427124023438, "loss": 0.1801, "rewards/accuracies": 1.0, "rewards/chosen": 1.7250056266784668, "rewards/margins": 1.7250056266784668, "rewards/rejected": 0.0, "step": 706 }, { "epoch": 3.9497206703910615, "grad_norm": 4.557502412492466, "learning_rate": 9.358789346702638e-07, "logits/chosen": -3.9285929203033447, "logits/rejected": -3.7418060302734375, "logps/chosen": -31.137500762939453, "logps/rejected": -34.382118225097656, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": 2.618072509765625, "rewards/margins": 2.618072509765625, "rewards/rejected": 0.0, "step": 707 }, { "epoch": 3.9553072625698324, "grad_norm": 5.425198650099388, "learning_rate": 9.355799724441703e-07, "logits/chosen": -3.633681297302246, "logits/rejected": -3.6181023120880127, "logps/chosen": -20.004682540893555, "logps/rejected": -15.314538955688477, "loss": 0.1568, "rewards/accuracies": 1.0, "rewards/chosen": 2.233163356781006, "rewards/margins": 2.233163356781006, "rewards/rejected": 0.0, "step": 708 }, { "epoch": 3.9608938547486034, "grad_norm": 7.821883107455096, "learning_rate": 9.352803628686652e-07, "logits/chosen": -3.8305516242980957, "logits/rejected": -3.6730797290802, "logps/chosen": -11.294788360595703, "logps/rejected": -34.06853103637695, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 2.016578435897827, "rewards/margins": 2.016578435897827, "rewards/rejected": 0.0, "step": 709 }, { "epoch": 3.9664804469273744, "grad_norm": 6.80654800888443, "learning_rate": 9.349801063890217e-07, "logits/chosen": -3.607896089553833, "logits/rejected": -3.580332040786743, "logps/chosen": -7.952505111694336, "logps/rejected": -50.59882354736328, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": 1.695425033569336, "rewards/margins": 1.695425033569336, "rewards/rejected": 0.0, "step": 710 }, { "epoch": 3.972067039106145, "grad_norm": 5.817496764436292, "learning_rate": 9.346792034514745e-07, "logits/chosen": -3.566187858581543, "logits/rejected": -3.4624316692352295, "logps/chosen": -12.475229263305664, "logps/rejected": -24.530345916748047, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": 1.693335771560669, "rewards/margins": 1.693335771560669, "rewards/rejected": 0.0, "step": 711 }, { "epoch": 3.977653631284916, "grad_norm": 5.012839322578175, "learning_rate": 9.343776545032187e-07, "logits/chosen": -3.9504566192626953, "logits/rejected": -3.617793560028076, "logps/chosen": -14.308114051818848, "logps/rejected": -35.20501708984375, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 1.9731078147888184, "rewards/margins": 1.9731078147888184, "rewards/rejected": 0.0, "step": 712 }, { "epoch": 3.983240223463687, "grad_norm": 5.075294614685221, "learning_rate": 9.340754599924098e-07, "logits/chosen": -3.4559686183929443, "logits/rejected": -3.4902491569519043, "logps/chosen": -10.13359260559082, "logps/rejected": -24.145774841308594, "loss": 0.156, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532363653182983, "rewards/margins": 1.5532363653182983, "rewards/rejected": 0.0, "step": 713 }, { "epoch": 3.988826815642458, "grad_norm": 6.10877500712863, "learning_rate": 9.337726203681628e-07, "logits/chosen": -3.6558167934417725, "logits/rejected": -3.7465085983276367, "logps/chosen": -9.029916763305664, "logps/rejected": -35.61157989501953, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430715322494507, "rewards/margins": 1.6430715322494507, "rewards/rejected": 0.0, "step": 714 }, { "epoch": 3.994413407821229, "grad_norm": 5.645512806316481, "learning_rate": 9.334691360805515e-07, "logits/chosen": -3.5487918853759766, "logits/rejected": -3.520137310028076, "logps/chosen": -22.947519302368164, "logps/rejected": -24.66504669189453, "loss": 0.1675, "rewards/accuracies": 1.0, "rewards/chosen": 2.2151451110839844, "rewards/margins": 2.2151451110839844, "rewards/rejected": 0.0, "step": 715 }, { "epoch": 4.0, "grad_norm": 3.3119225096540705, "learning_rate": 9.33165007580607e-07, "logits/chosen": -3.453325033187866, "logits/rejected": -3.3989417552948, "logps/chosen": -10.77746868133545, "logps/rejected": -19.99879264831543, "loss": 0.1574, "rewards/accuracies": 1.0, "rewards/chosen": 1.782940149307251, "rewards/margins": 1.782940149307251, "rewards/rejected": 0.0, "step": 716 }, { "epoch": 4.005586592178771, "grad_norm": 3.1066199059131994, "learning_rate": 9.32860235320319e-07, "logits/chosen": -3.86627197265625, "logits/rejected": -3.867809534072876, "logps/chosen": -7.358992099761963, "logps/rejected": -28.105648040771484, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 1.9243531227111816, "rewards/margins": 1.9243531227111816, "rewards/rejected": 0.0, "step": 717 }, { "epoch": 4.011173184357542, "grad_norm": 3.139164996115159, "learning_rate": 9.325548197526329e-07, "logits/chosen": -3.582491874694824, "logits/rejected": -3.5838065147399902, "logps/chosen": -12.756607055664062, "logps/rejected": -31.588409423828125, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 1.5849655866622925, "rewards/margins": 1.5849655866622925, "rewards/rejected": 0.0, "step": 718 }, { "epoch": 4.016759776536313, "grad_norm": 2.8827433677868246, "learning_rate": 9.322487613314511e-07, "logits/chosen": -3.928121566772461, "logits/rejected": -3.5920283794403076, "logps/chosen": -5.253528594970703, "logps/rejected": -29.994462966918945, "loss": 0.1629, "rewards/accuracies": 1.0, "rewards/chosen": 1.885958194732666, "rewards/margins": 1.885958194732666, "rewards/rejected": 0.0, "step": 719 }, { "epoch": 4.022346368715084, "grad_norm": 4.710942819453221, "learning_rate": 9.319420605116306e-07, "logits/chosen": -3.8790347576141357, "logits/rejected": -3.5468173027038574, "logps/chosen": -5.437746524810791, "logps/rejected": -17.863117218017578, "loss": 0.1485, "rewards/accuracies": 1.0, "rewards/chosen": 1.861025333404541, "rewards/margins": 1.861025333404541, "rewards/rejected": 0.0, "step": 720 }, { "epoch": 4.022346368715084, "eval_logits/chosen": -3.4488277435302734, "eval_logits/rejected": -3.556626796722412, "eval_logps/chosen": -19.6168270111084, "eval_logps/rejected": -34.507118225097656, "eval_loss": 0.39809659123420715, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 1.0453212261199951, "eval_rewards/margins": 1.0453212261199951, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7389, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 720 }, { "epoch": 4.027932960893855, "grad_norm": 2.705266372688293, "learning_rate": 9.316347177489834e-07, "logits/chosen": -3.5220789909362793, "logits/rejected": -3.2003581523895264, "logps/chosen": -10.75131607055664, "logps/rejected": -18.33860969543457, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 2.5228328704833984, "rewards/margins": 2.5228328704833984, "rewards/rejected": 0.0, "step": 721 }, { "epoch": 4.033519553072626, "grad_norm": 2.790758874166294, "learning_rate": 9.313267335002758e-07, "logits/chosen": -3.6527023315429688, "logits/rejected": -3.6668760776519775, "logps/chosen": -13.89764404296875, "logps/rejected": -54.01905822753906, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 2.4421184062957764, "rewards/margins": 2.4421184062957764, "rewards/rejected": 0.0, "step": 722 }, { "epoch": 4.039106145251397, "grad_norm": 3.0446453914168727, "learning_rate": 9.310181082232272e-07, "logits/chosen": -3.152895927429199, "logits/rejected": -2.9274024963378906, "logps/chosen": -18.17841339111328, "logps/rejected": -21.236652374267578, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": 2.428865432739258, "rewards/margins": 2.428865432739258, "rewards/rejected": 0.0, "step": 723 }, { "epoch": 4.044692737430168, "grad_norm": 3.4419644926394546, "learning_rate": 9.307088423765095e-07, "logits/chosen": -3.4919638633728027, "logits/rejected": -3.390697956085205, "logps/chosen": -6.289459705352783, "logps/rejected": -33.2501335144043, "loss": 0.1375, "rewards/accuracies": 1.0, "rewards/chosen": 1.4905641078948975, "rewards/margins": 1.4905641078948975, "rewards/rejected": 0.0, "step": 724 }, { "epoch": 4.050279329608939, "grad_norm": 3.115348890510477, "learning_rate": 9.303989364197468e-07, "logits/chosen": -3.5298280715942383, "logits/rejected": -3.65049409866333, "logps/chosen": -8.518899917602539, "logps/rejected": -64.3823013305664, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 2.480935573577881, "rewards/margins": 2.480935573577881, "rewards/rejected": 0.0, "step": 725 }, { "epoch": 4.055865921787709, "grad_norm": 3.074773937376554, "learning_rate": 9.300883908135151e-07, "logits/chosen": -3.6087746620178223, "logits/rejected": -3.4459707736968994, "logps/chosen": -11.586292266845703, "logps/rejected": -28.29360008239746, "loss": 0.1524, "rewards/accuracies": 1.0, "rewards/chosen": 2.2884156703948975, "rewards/margins": 2.2884156703948975, "rewards/rejected": 0.0, "step": 726 }, { "epoch": 4.06145251396648, "grad_norm": 3.724823833704586, "learning_rate": 9.297772060193399e-07, "logits/chosen": -3.662719488143921, "logits/rejected": -3.6215732097625732, "logps/chosen": -8.40018081665039, "logps/rejected": -24.1854190826416, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 1.9744935035705566, "rewards/margins": 1.9744935035705566, "rewards/rejected": 0.0, "step": 727 }, { "epoch": 4.067039106145251, "grad_norm": 2.8456632938699813, "learning_rate": 9.294653824996974e-07, "logits/chosen": -3.7286922931671143, "logits/rejected": -3.5332837104797363, "logps/chosen": -3.7866902351379395, "logps/rejected": -48.27838134765625, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": 1.6437840461730957, "rewards/margins": 1.6437840461730957, "rewards/rejected": 0.0, "step": 728 }, { "epoch": 4.072625698324022, "grad_norm": 3.3654213272524216, "learning_rate": 9.291529207180131e-07, "logits/chosen": -3.652053117752075, "logits/rejected": -3.7424676418304443, "logps/chosen": -4.034215927124023, "logps/rejected": -38.23637390136719, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 1.381847858428955, "rewards/margins": 1.381847858428955, "rewards/rejected": 0.0, "step": 729 }, { "epoch": 4.078212290502793, "grad_norm": 2.8708320826782803, "learning_rate": 9.288398211386606e-07, "logits/chosen": -3.9891786575317383, "logits/rejected": -3.8775477409362793, "logps/chosen": -8.785106658935547, "logps/rejected": -16.385774612426758, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": 2.3422060012817383, "rewards/margins": 2.3422060012817383, "rewards/rejected": 0.0, "step": 730 }, { "epoch": 4.083798882681564, "grad_norm": 2.882045035043276, "learning_rate": 9.285260842269616e-07, "logits/chosen": -3.5995490550994873, "logits/rejected": -3.3151626586914062, "logps/chosen": -6.383079528808594, "logps/rejected": -46.785770416259766, "loss": 0.1506, "rewards/accuracies": 1.0, "rewards/chosen": 1.5607208013534546, "rewards/margins": 1.5607208013534546, "rewards/rejected": 0.0, "step": 731 }, { "epoch": 4.089385474860335, "grad_norm": 3.2785859506970882, "learning_rate": 9.28211710449185e-07, "logits/chosen": -3.473612070083618, "logits/rejected": -3.66225266456604, "logps/chosen": -2.542301654815674, "logps/rejected": -73.18181610107422, "loss": 0.1471, "rewards/accuracies": 1.0, "rewards/chosen": 1.4122735261917114, "rewards/margins": 1.4122735261917114, "rewards/rejected": 0.0, "step": 732 }, { "epoch": 4.094972067039106, "grad_norm": 2.974385466628785, "learning_rate": 9.278967002725464e-07, "logits/chosen": -3.808043956756592, "logits/rejected": -3.915055513381958, "logps/chosen": -11.477127075195312, "logps/rejected": -49.63106155395508, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": 1.7113841772079468, "rewards/margins": 1.7113841772079468, "rewards/rejected": 0.0, "step": 733 }, { "epoch": 4.100558659217877, "grad_norm": 2.511075806636554, "learning_rate": 9.275810541652071e-07, "logits/chosen": -3.7008209228515625, "logits/rejected": -3.6690826416015625, "logps/chosen": -5.514899253845215, "logps/rejected": -39.429718017578125, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 2.1070127487182617, "rewards/margins": 2.1070127487182617, "rewards/rejected": 0.0, "step": 734 }, { "epoch": 4.106145251396648, "grad_norm": 3.211602338759111, "learning_rate": 9.272647725962732e-07, "logits/chosen": -3.5125110149383545, "logits/rejected": -3.681480646133423, "logps/chosen": -32.70913314819336, "logps/rejected": -31.92466926574707, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": 2.6375980377197266, "rewards/margins": 2.6375980377197266, "rewards/rejected": 0.0, "step": 735 }, { "epoch": 4.111731843575419, "grad_norm": 3.522666740853587, "learning_rate": 9.269478560357956e-07, "logits/chosen": -3.8538997173309326, "logits/rejected": -3.648585796356201, "logps/chosen": -19.528947830200195, "logps/rejected": -23.962234497070312, "loss": 0.1666, "rewards/accuracies": 1.0, "rewards/chosen": 2.0669362545013428, "rewards/margins": 2.0669362545013428, "rewards/rejected": 0.0, "step": 736 }, { "epoch": 4.11731843575419, "grad_norm": 3.204503869499785, "learning_rate": 9.266303049547687e-07, "logits/chosen": -3.47011137008667, "logits/rejected": -3.659250259399414, "logps/chosen": -10.742396354675293, "logps/rejected": -27.028169631958008, "loss": 0.1412, "rewards/accuracies": 1.0, "rewards/chosen": 2.4456121921539307, "rewards/margins": 2.4456121921539307, "rewards/rejected": 0.0, "step": 737 }, { "epoch": 4.122905027932961, "grad_norm": 3.167416898284249, "learning_rate": 9.263121198251298e-07, "logits/chosen": -3.63665771484375, "logits/rejected": -3.7554972171783447, "logps/chosen": -8.60072135925293, "logps/rejected": -38.334529876708984, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 1.9187707901000977, "rewards/margins": 1.9187707901000977, "rewards/rejected": 0.0, "step": 738 }, { "epoch": 4.128491620111732, "grad_norm": 6.0027105137760906, "learning_rate": 9.25993301119759e-07, "logits/chosen": -3.747448444366455, "logits/rejected": -3.8216421604156494, "logps/chosen": -6.165863990783691, "logps/rejected": -30.87430191040039, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 1.2946873903274536, "rewards/margins": 1.2946873903274536, "rewards/rejected": 0.0, "step": 739 }, { "epoch": 4.134078212290503, "grad_norm": 2.9984148756243285, "learning_rate": 9.256738493124773e-07, "logits/chosen": -3.3836264610290527, "logits/rejected": -3.621556520462036, "logps/chosen": -24.779489517211914, "logps/rejected": -45.01148223876953, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 2.5112781524658203, "rewards/margins": 2.5112781524658203, "rewards/rejected": 0.0, "step": 740 }, { "epoch": 4.134078212290503, "eval_logits/chosen": -3.4461867809295654, "eval_logits/rejected": -3.5609054565429688, "eval_logps/chosen": -20.51335906982422, "eval_logps/rejected": -37.09926986694336, "eval_loss": 0.45483899116516113, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9556684494018555, "eval_rewards/margins": 0.9556684494018555, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7462, "eval_samples_per_second": 9.467, "eval_steps_per_second": 0.305, "step": 740 }, { "epoch": 4.139664804469274, "grad_norm": 3.6893463435658416, "learning_rate": 9.253537648780472e-07, "logits/chosen": -3.7558329105377197, "logits/rejected": -3.8279576301574707, "logps/chosen": -14.115180969238281, "logps/rejected": -22.908321380615234, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 2.6616134643554688, "rewards/margins": 2.6616134643554688, "rewards/rejected": 0.0, "step": 741 }, { "epoch": 4.145251396648045, "grad_norm": 3.81926847554455, "learning_rate": 9.250330482921711e-07, "logits/chosen": -3.778927803039551, "logits/rejected": -3.671673536300659, "logps/chosen": -1.6724507808685303, "logps/rejected": -75.70147705078125, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 1.1681830883026123, "rewards/margins": 1.1681830883026123, "rewards/rejected": 0.0, "step": 742 }, { "epoch": 4.150837988826815, "grad_norm": 6.57804840798006, "learning_rate": 9.24711700031491e-07, "logits/chosen": -3.6145131587982178, "logits/rejected": -3.5747292041778564, "logps/chosen": -13.17055892944336, "logps/rejected": -25.868175506591797, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 2.250964403152466, "rewards/margins": 2.250964403152466, "rewards/rejected": 0.0, "step": 743 }, { "epoch": 4.156424581005586, "grad_norm": 2.9823534594630368, "learning_rate": 9.243897205735878e-07, "logits/chosen": -3.810460090637207, "logits/rejected": -3.8142824172973633, "logps/chosen": -3.4041643142700195, "logps/rejected": -65.01181030273438, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 1.5365550518035889, "rewards/margins": 1.5365550518035889, "rewards/rejected": 0.0, "step": 744 }, { "epoch": 4.162011173184357, "grad_norm": 3.0972227791265206, "learning_rate": 9.2406711039698e-07, "logits/chosen": -3.4885849952697754, "logits/rejected": -3.400820255279541, "logps/chosen": -3.5544004440307617, "logps/rejected": -31.687374114990234, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 1.8177826404571533, "rewards/margins": 1.8177826404571533, "rewards/rejected": 0.0, "step": 745 }, { "epoch": 4.167597765363128, "grad_norm": 3.0874643852325176, "learning_rate": 9.237438699811238e-07, "logits/chosen": -3.5004031658172607, "logits/rejected": -3.6086864471435547, "logps/chosen": -8.36198616027832, "logps/rejected": -24.751819610595703, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.266913414001465, "rewards/margins": 2.266913414001465, "rewards/rejected": 0.0, "step": 746 }, { "epoch": 4.173184357541899, "grad_norm": 3.213656743187189, "learning_rate": 9.234199998064124e-07, "logits/chosen": -3.846193313598633, "logits/rejected": -3.6449530124664307, "logps/chosen": -8.9894380569458, "logps/rejected": -40.17134094238281, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 1.7603962421417236, "rewards/margins": 1.7603962421417236, "rewards/rejected": 0.0, "step": 747 }, { "epoch": 4.17877094972067, "grad_norm": 3.039480742921061, "learning_rate": 9.230955003541742e-07, "logits/chosen": -3.5335752964019775, "logits/rejected": -3.739990711212158, "logps/chosen": -2.6747734546661377, "logps/rejected": -38.45859909057617, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 1.272268295288086, "rewards/margins": 1.272268295288086, "rewards/rejected": 0.0, "step": 748 }, { "epoch": 4.184357541899441, "grad_norm": 3.1356275674508804, "learning_rate": 9.227703721066733e-07, "logits/chosen": -3.3352243900299072, "logits/rejected": -3.300250768661499, "logps/chosen": -21.90550994873047, "logps/rejected": -33.50829315185547, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 2.1247329711914062, "rewards/margins": 2.1247329711914062, "rewards/rejected": 0.0, "step": 749 }, { "epoch": 4.189944134078212, "grad_norm": 3.3401965954263444, "learning_rate": 9.224446155471082e-07, "logits/chosen": -3.4573476314544678, "logits/rejected": -3.706101179122925, "logps/chosen": -42.64149856567383, "logps/rejected": -27.08307456970215, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 2.8673996925354004, "rewards/margins": 2.8673996925354004, "rewards/rejected": 0.0, "step": 750 }, { "epoch": 4.195530726256983, "grad_norm": 3.8568903726565766, "learning_rate": 9.221182311596111e-07, "logits/chosen": -3.56864070892334, "logits/rejected": -3.4958670139312744, "logps/chosen": -19.352327346801758, "logps/rejected": -35.47402572631836, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": 2.6086671352386475, "rewards/margins": 2.6086671352386475, "rewards/rejected": 0.0, "step": 751 }, { "epoch": 4.201117318435754, "grad_norm": 4.538585470300417, "learning_rate": 9.217912194292473e-07, "logits/chosen": -3.2495787143707275, "logits/rejected": -3.3900630474090576, "logps/chosen": -3.3145623207092285, "logps/rejected": -51.09520721435547, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 1.7654829025268555, "rewards/margins": 1.7654829025268555, "rewards/rejected": 0.0, "step": 752 }, { "epoch": 4.206703910614525, "grad_norm": 5.442019841494312, "learning_rate": 9.214635808420146e-07, "logits/chosen": -3.792428731918335, "logits/rejected": -3.8434760570526123, "logps/chosen": -12.232793807983398, "logps/rejected": -32.03156661987305, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 2.4639711380004883, "rewards/margins": 2.4639711380004883, "rewards/rejected": 0.0, "step": 753 }, { "epoch": 4.212290502793296, "grad_norm": 3.116250654254641, "learning_rate": 9.211353158848421e-07, "logits/chosen": -3.395803689956665, "logits/rejected": -3.3356313705444336, "logps/chosen": -11.544515609741211, "logps/rejected": -31.22180938720703, "loss": 0.1504, "rewards/accuracies": 1.0, "rewards/chosen": 2.2545347213745117, "rewards/margins": 2.2545347213745117, "rewards/rejected": 0.0, "step": 754 }, { "epoch": 4.217877094972067, "grad_norm": 3.321881698001037, "learning_rate": 9.208064250455902e-07, "logits/chosen": -3.676140069961548, "logits/rejected": -3.8750996589660645, "logps/chosen": -4.331568717956543, "logps/rejected": -27.3017578125, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 1.8052383661270142, "rewards/margins": 1.8052383661270142, "rewards/rejected": 0.0, "step": 755 }, { "epoch": 4.223463687150838, "grad_norm": 3.4889153207537933, "learning_rate": 9.204769088130491e-07, "logits/chosen": -3.7074637413024902, "logits/rejected": -3.801008939743042, "logps/chosen": -9.631816864013672, "logps/rejected": -45.95659637451172, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 2.324375629425049, "rewards/margins": 2.324375629425049, "rewards/rejected": 0.0, "step": 756 }, { "epoch": 4.229050279329609, "grad_norm": 3.147759262629917, "learning_rate": 9.201467676769388e-07, "logits/chosen": -3.383687973022461, "logits/rejected": -3.453930139541626, "logps/chosen": -31.3158016204834, "logps/rejected": -25.914657592773438, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.4040439128875732, "rewards/margins": 2.4040439128875732, "rewards/rejected": 0.0, "step": 757 }, { "epoch": 4.23463687150838, "grad_norm": 3.2858190977651427, "learning_rate": 9.198160021279076e-07, "logits/chosen": -3.4089393615722656, "logits/rejected": -3.566396474838257, "logps/chosen": -10.551408767700195, "logps/rejected": -35.372467041015625, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 1.9044461250305176, "rewards/margins": 1.9044461250305176, "rewards/rejected": 0.0, "step": 758 }, { "epoch": 4.240223463687151, "grad_norm": 3.293436579521988, "learning_rate": 9.19484612657532e-07, "logits/chosen": -4.007391929626465, "logits/rejected": -3.845597505569458, "logps/chosen": -3.860405683517456, "logps/rejected": -29.047256469726562, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 1.710767388343811, "rewards/margins": 1.710767388343811, "rewards/rejected": 0.0, "step": 759 }, { "epoch": 4.245810055865922, "grad_norm": 3.6355080196955285, "learning_rate": 9.19152599758316e-07, "logits/chosen": -3.7784767150878906, "logits/rejected": -3.559087038040161, "logps/chosen": -7.721414566040039, "logps/rejected": -39.091068267822266, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": 2.056973457336426, "rewards/margins": 2.056973457336426, "rewards/rejected": 0.0, "step": 760 }, { "epoch": 4.245810055865922, "eval_logits/chosen": -3.4631340503692627, "eval_logits/rejected": -3.580005645751953, "eval_logps/chosen": -20.73528480529785, "eval_logps/rejected": -37.09739303588867, "eval_loss": 0.4545563757419586, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9334754943847656, "eval_rewards/margins": 0.9334754943847656, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6984, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 760 }, { "epoch": 4.251396648044693, "grad_norm": 4.944444500555194, "learning_rate": 9.188199639236896e-07, "logits/chosen": -3.581573963165283, "logits/rejected": -3.8438568115234375, "logps/chosen": -5.980813026428223, "logps/rejected": -34.75967025756836, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 2.1940486431121826, "rewards/margins": 2.1940486431121826, "rewards/rejected": 0.0, "step": 761 }, { "epoch": 4.256983240223463, "grad_norm": 3.0382683861843462, "learning_rate": 9.184867056480092e-07, "logits/chosen": -3.32403826713562, "logits/rejected": -3.279866933822632, "logps/chosen": -15.873481750488281, "logps/rejected": -32.66768264770508, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": 2.380401134490967, "rewards/margins": 2.380401134490967, "rewards/rejected": 0.0, "step": 762 }, { "epoch": 4.262569832402234, "grad_norm": 3.525863356895964, "learning_rate": 9.181528254265558e-07, "logits/chosen": -3.561990737915039, "logits/rejected": -3.3689842224121094, "logps/chosen": -14.201573371887207, "logps/rejected": -36.46363067626953, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": 1.926465392112732, "rewards/margins": 1.926465392112732, "rewards/rejected": 0.0, "step": 763 }, { "epoch": 4.268156424581005, "grad_norm": 3.4135660098550615, "learning_rate": 9.178183237555348e-07, "logits/chosen": -3.6096136569976807, "logits/rejected": -3.447056770324707, "logps/chosen": -4.089483261108398, "logps/rejected": -35.93095397949219, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 1.7403192520141602, "rewards/margins": 1.7403192520141602, "rewards/rejected": 0.0, "step": 764 }, { "epoch": 4.273743016759776, "grad_norm": 3.3652695327928948, "learning_rate": 9.174832011320755e-07, "logits/chosen": -3.407003164291382, "logits/rejected": -3.7466068267822266, "logps/chosen": -4.882349014282227, "logps/rejected": -75.54251098632812, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 1.7903443574905396, "rewards/margins": 1.7903443574905396, "rewards/rejected": 0.0, "step": 765 }, { "epoch": 4.279329608938547, "grad_norm": 3.6507183315670475, "learning_rate": 9.171474580542295e-07, "logits/chosen": -3.983333110809326, "logits/rejected": -3.87296199798584, "logps/chosen": -5.923532485961914, "logps/rejected": -20.535736083984375, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 1.8490628004074097, "rewards/margins": 1.8490628004074097, "rewards/rejected": 0.0, "step": 766 }, { "epoch": 4.284916201117318, "grad_norm": 3.592633095633263, "learning_rate": 9.168110950209709e-07, "logits/chosen": -3.6785390377044678, "logits/rejected": -3.9248623847961426, "logps/chosen": -20.322904586791992, "logps/rejected": -33.33665466308594, "loss": 0.1372, "rewards/accuracies": 1.0, "rewards/chosen": 2.527787446975708, "rewards/margins": 2.527787446975708, "rewards/rejected": 0.0, "step": 767 }, { "epoch": 4.290502793296089, "grad_norm": 3.8296725215002936, "learning_rate": 9.164741125321951e-07, "logits/chosen": -3.448434591293335, "logits/rejected": -3.6976163387298584, "logps/chosen": -6.484898567199707, "logps/rejected": -96.32450103759766, "loss": 0.1548, "rewards/accuracies": 1.0, "rewards/chosen": 2.413029432296753, "rewards/margins": 2.413029432296753, "rewards/rejected": 0.0, "step": 768 }, { "epoch": 4.29608938547486, "grad_norm": 3.7465486737345475, "learning_rate": 9.16136511088718e-07, "logits/chosen": -3.265479803085327, "logits/rejected": -3.4205503463745117, "logps/chosen": -9.510969161987305, "logps/rejected": -42.40364074707031, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 1.9431638717651367, "rewards/margins": 1.9431638717651367, "rewards/rejected": 0.0, "step": 769 }, { "epoch": 4.301675977653631, "grad_norm": 4.749079193531236, "learning_rate": 9.157982911922758e-07, "logits/chosen": -3.686668872833252, "logits/rejected": -3.5783064365386963, "logps/chosen": -11.030715942382812, "logps/rejected": -29.405502319335938, "loss": 0.1341, "rewards/accuracies": 1.0, "rewards/chosen": 2.23626971244812, "rewards/margins": 2.23626971244812, "rewards/rejected": 0.0, "step": 770 }, { "epoch": 4.307262569832402, "grad_norm": 3.234669849885449, "learning_rate": 9.154594533455231e-07, "logits/chosen": -3.8428173065185547, "logits/rejected": -3.558997869491577, "logps/chosen": -3.6590447425842285, "logps/rejected": -25.97920799255371, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 1.66550874710083, "rewards/margins": 1.66550874710083, "rewards/rejected": 0.0, "step": 771 }, { "epoch": 4.312849162011173, "grad_norm": 3.2309803622647215, "learning_rate": 9.151199980520335e-07, "logits/chosen": -3.673877477645874, "logits/rejected": -3.547835350036621, "logps/chosen": -11.635046005249023, "logps/rejected": -38.26511001586914, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 2.192197561264038, "rewards/margins": 2.192197561264038, "rewards/rejected": 0.0, "step": 772 }, { "epoch": 4.318435754189944, "grad_norm": 2.866293349271251, "learning_rate": 9.147799258162981e-07, "logits/chosen": -3.293854236602783, "logits/rejected": -3.5211727619171143, "logps/chosen": -8.397584915161133, "logps/rejected": -41.11590576171875, "loss": 0.1224, "rewards/accuracies": 1.0, "rewards/chosen": 1.8701279163360596, "rewards/margins": 1.8701279163360596, "rewards/rejected": 0.0, "step": 773 }, { "epoch": 4.324022346368715, "grad_norm": 3.5849916471280334, "learning_rate": 9.144392371437244e-07, "logits/chosen": -3.6938962936401367, "logits/rejected": -3.7141709327697754, "logps/chosen": -4.073142051696777, "logps/rejected": -26.94833755493164, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 1.8245025873184204, "rewards/margins": 1.8245025873184204, "rewards/rejected": 0.0, "step": 774 }, { "epoch": 4.329608938547486, "grad_norm": 2.7710639424681016, "learning_rate": 9.140979325406369e-07, "logits/chosen": -3.309889554977417, "logits/rejected": -3.4348950386047363, "logps/chosen": -4.105931758880615, "logps/rejected": -58.37015914916992, "loss": 0.1559, "rewards/accuracies": 1.0, "rewards/chosen": 1.727717638015747, "rewards/margins": 1.727717638015747, "rewards/rejected": 0.0, "step": 775 }, { "epoch": 4.335195530726257, "grad_norm": 4.0928340001123775, "learning_rate": 9.137560125142748e-07, "logits/chosen": -3.6457533836364746, "logits/rejected": -3.8680081367492676, "logps/chosen": -5.9027228355407715, "logps/rejected": -48.547664642333984, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": 1.5917106866836548, "rewards/margins": 1.5917106866836548, "rewards/rejected": 0.0, "step": 776 }, { "epoch": 4.340782122905028, "grad_norm": 3.2766470621291197, "learning_rate": 9.134134775727921e-07, "logits/chosen": -3.70273756980896, "logits/rejected": -3.5022263526916504, "logps/chosen": -6.853073596954346, "logps/rejected": -37.53110885620117, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 2.01503324508667, "rewards/margins": 2.01503324508667, "rewards/rejected": 0.0, "step": 777 }, { "epoch": 4.346368715083799, "grad_norm": 4.640038749004769, "learning_rate": 9.13070328225257e-07, "logits/chosen": -3.47983455657959, "logits/rejected": -3.3371739387512207, "logps/chosen": -7.3134355545043945, "logps/rejected": -32.42387390136719, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 2.5095090866088867, "rewards/margins": 2.5095090866088867, "rewards/rejected": 0.0, "step": 778 }, { "epoch": 4.351955307262569, "grad_norm": 3.844517908712181, "learning_rate": 9.127265649816503e-07, "logits/chosen": -3.4332523345947266, "logits/rejected": -3.6509153842926025, "logps/chosen": -13.059277534484863, "logps/rejected": -48.963497161865234, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": 1.9094536304473877, "rewards/margins": 1.9094536304473877, "rewards/rejected": 0.0, "step": 779 }, { "epoch": 4.35754189944134, "grad_norm": 2.9517341839947115, "learning_rate": 9.123821883528653e-07, "logits/chosen": -3.7786710262298584, "logits/rejected": -3.6058902740478516, "logps/chosen": -2.9812262058258057, "logps/rejected": -54.710079193115234, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 1.6125730276107788, "rewards/margins": 1.6125730276107788, "rewards/rejected": 0.0, "step": 780 }, { "epoch": 4.35754189944134, "eval_logits/chosen": -3.43599271774292, "eval_logits/rejected": -3.5524775981903076, "eval_logps/chosen": -20.9012393951416, "eval_logps/rejected": -38.057952880859375, "eval_loss": 0.4690379500389099, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9168804883956909, "eval_rewards/margins": 0.9168804883956909, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7009, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 780 }, { "epoch": 4.363128491620111, "grad_norm": 5.343100546730451, "learning_rate": 9.120371988507073e-07, "logits/chosen": -3.424809217453003, "logits/rejected": -3.5738799571990967, "logps/chosen": -35.25627899169922, "logps/rejected": -46.489288330078125, "loss": 0.1704, "rewards/accuracies": 1.0, "rewards/chosen": 1.8060996532440186, "rewards/margins": 1.8060996532440186, "rewards/rejected": 0.0, "step": 781 }, { "epoch": 4.368715083798882, "grad_norm": 3.10295127317863, "learning_rate": 9.11691596987892e-07, "logits/chosen": -3.82004976272583, "logits/rejected": -3.744163751602173, "logps/chosen": -18.236862182617188, "logps/rejected": -40.948020935058594, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 2.6354761123657227, "rewards/margins": 2.6354761123657227, "rewards/rejected": 0.0, "step": 782 }, { "epoch": 4.374301675977653, "grad_norm": 6.635272543839055, "learning_rate": 9.113453832780452e-07, "logits/chosen": -3.6957621574401855, "logits/rejected": -3.819979429244995, "logps/chosen": -11.850873947143555, "logps/rejected": -44.036354064941406, "loss": 0.1585, "rewards/accuracies": 1.0, "rewards/chosen": 2.082808494567871, "rewards/margins": 2.082808494567871, "rewards/rejected": 0.0, "step": 783 }, { "epoch": 4.379888268156424, "grad_norm": 3.854798709356382, "learning_rate": 9.109985582357024e-07, "logits/chosen": -3.4215121269226074, "logits/rejected": -3.3615832328796387, "logps/chosen": -9.830392837524414, "logps/rejected": -46.99778366088867, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": 2.1329922676086426, "rewards/margins": 2.1329922676086426, "rewards/rejected": 0.0, "step": 784 }, { "epoch": 4.385474860335195, "grad_norm": 3.787443142878194, "learning_rate": 9.106511223763072e-07, "logits/chosen": -3.693557024002075, "logits/rejected": -3.639054298400879, "logps/chosen": -12.23244857788086, "logps/rejected": -44.668846130371094, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 2.178089141845703, "rewards/margins": 2.178089141845703, "rewards/rejected": 0.0, "step": 785 }, { "epoch": 4.391061452513966, "grad_norm": 8.74467544616392, "learning_rate": 9.10303076216211e-07, "logits/chosen": -3.5624606609344482, "logits/rejected": -3.3727633953094482, "logps/chosen": -10.703680038452148, "logps/rejected": -34.77155685424805, "loss": 0.1832, "rewards/accuracies": 1.0, "rewards/chosen": 1.7109159231185913, "rewards/margins": 1.7109159231185913, "rewards/rejected": 0.0, "step": 786 }, { "epoch": 4.396648044692737, "grad_norm": 3.4825215321185357, "learning_rate": 9.099544202726727e-07, "logits/chosen": -3.783106803894043, "logits/rejected": -3.583214282989502, "logps/chosen": -23.693105697631836, "logps/rejected": -20.57411003112793, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": 2.845611333847046, "rewards/margins": 2.845611333847046, "rewards/rejected": 0.0, "step": 787 }, { "epoch": 4.402234636871508, "grad_norm": 4.135132450259034, "learning_rate": 9.096051550638571e-07, "logits/chosen": -3.5946578979492188, "logits/rejected": -3.6001033782958984, "logps/chosen": -41.17884063720703, "logps/rejected": -32.217872619628906, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.4946250915527344, "rewards/margins": 2.4946250915527344, "rewards/rejected": 0.0, "step": 788 }, { "epoch": 4.407821229050279, "grad_norm": 8.7635921333923, "learning_rate": 9.092552811088341e-07, "logits/chosen": -3.9743850231170654, "logits/rejected": -3.7202308177948, "logps/chosen": -10.13995361328125, "logps/rejected": -32.595420837402344, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": 2.177818775177002, "rewards/margins": 2.177818775177002, "rewards/rejected": 0.0, "step": 789 }, { "epoch": 4.41340782122905, "grad_norm": 4.1633615918267655, "learning_rate": 9.089047989275792e-07, "logits/chosen": -3.6054015159606934, "logits/rejected": -3.681760787963867, "logps/chosen": -6.8949151039123535, "logps/rejected": -27.745790481567383, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 2.0137648582458496, "rewards/margins": 2.0137648582458496, "rewards/rejected": 0.0, "step": 790 }, { "epoch": 4.418994413407821, "grad_norm": 2.9385780581086913, "learning_rate": 9.085537090409711e-07, "logits/chosen": -3.7114686965942383, "logits/rejected": -3.642517328262329, "logps/chosen": -9.158451080322266, "logps/rejected": -57.30244064331055, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 2.2197651863098145, "rewards/margins": 2.2197651863098145, "rewards/rejected": 0.0, "step": 791 }, { "epoch": 4.424581005586592, "grad_norm": 4.421768930792704, "learning_rate": 9.082020119707919e-07, "logits/chosen": -3.851954698562622, "logits/rejected": -3.7828562259674072, "logps/chosen": -9.329696655273438, "logps/rejected": -44.28007507324219, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 2.448639392852783, "rewards/margins": 2.448639392852783, "rewards/rejected": 0.0, "step": 792 }, { "epoch": 4.430167597765363, "grad_norm": 4.710272386727113, "learning_rate": 9.078497082397262e-07, "logits/chosen": -3.7823007106781006, "logits/rejected": -3.5928187370300293, "logps/chosen": -12.154468536376953, "logps/rejected": -22.485620498657227, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": 2.6742687225341797, "rewards/margins": 2.6742687225341797, "rewards/rejected": 0.0, "step": 793 }, { "epoch": 4.435754189944134, "grad_norm": 3.4723079306981233, "learning_rate": 9.074967983713599e-07, "logits/chosen": -3.509960174560547, "logits/rejected": -3.4457461833953857, "logps/chosen": -21.281532287597656, "logps/rejected": -21.520544052124023, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 2.3013064861297607, "rewards/margins": 2.3013064861297607, "rewards/rejected": 0.0, "step": 794 }, { "epoch": 4.441340782122905, "grad_norm": 4.243171534513039, "learning_rate": 9.071432828901799e-07, "logits/chosen": -3.7948055267333984, "logits/rejected": -3.835109233856201, "logps/chosen": -5.634298324584961, "logps/rejected": -28.5150089263916, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 2.071786403656006, "rewards/margins": 2.071786403656006, "rewards/rejected": 0.0, "step": 795 }, { "epoch": 4.446927374301676, "grad_norm": 4.408833849680875, "learning_rate": 9.067891623215735e-07, "logits/chosen": -3.2833240032196045, "logits/rejected": -3.6118528842926025, "logps/chosen": -38.4232177734375, "logps/rejected": -52.00761795043945, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 2.1886892318725586, "rewards/margins": 2.1886892318725586, "rewards/rejected": 0.0, "step": 796 }, { "epoch": 4.452513966480447, "grad_norm": 3.505128841617623, "learning_rate": 9.064344371918267e-07, "logits/chosen": -2.8041765689849854, "logits/rejected": -2.882662057876587, "logps/chosen": -13.393410682678223, "logps/rejected": -55.70225143432617, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 1.9918265342712402, "rewards/margins": 1.9918265342712402, "rewards/rejected": 0.0, "step": 797 }, { "epoch": 4.4581005586592175, "grad_norm": 4.033312450932131, "learning_rate": 9.06079108028124e-07, "logits/chosen": -3.5890114307403564, "logits/rejected": -3.583430051803589, "logps/chosen": -15.296281814575195, "logps/rejected": -51.344356536865234, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 2.810650587081909, "rewards/margins": 2.810650587081909, "rewards/rejected": 0.0, "step": 798 }, { "epoch": 4.4636871508379885, "grad_norm": 3.3631755261112306, "learning_rate": 9.057231753585483e-07, "logits/chosen": -3.6827738285064697, "logits/rejected": -3.0112385749816895, "logps/chosen": -14.723047256469727, "logps/rejected": -47.55908966064453, "loss": 0.1522, "rewards/accuracies": 1.0, "rewards/chosen": 2.315462589263916, "rewards/margins": 2.315462589263916, "rewards/rejected": 0.0, "step": 799 }, { "epoch": 4.4692737430167595, "grad_norm": 5.462703338360426, "learning_rate": 9.053666397120785e-07, "logits/chosen": -3.467860221862793, "logits/rejected": -3.461380958557129, "logps/chosen": -5.588186264038086, "logps/rejected": -35.241554260253906, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": 1.9312646389007568, "rewards/margins": 1.9312646389007568, "rewards/rejected": 0.0, "step": 800 }, { "epoch": 4.4692737430167595, "eval_logits/chosen": -3.439868927001953, "eval_logits/rejected": -3.557157516479492, "eval_logps/chosen": -20.577713012695312, "eval_logps/rejected": -36.924678802490234, "eval_loss": 0.4478033185005188, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9492327570915222, "eval_rewards/margins": 0.9492327570915222, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6994, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 800 }, { "epoch": 4.4748603351955305, "grad_norm": 2.7917272173132552, "learning_rate": 9.050095016185902e-07, "logits/chosen": -3.5902280807495117, "logits/rejected": -3.9027185440063477, "logps/chosen": -15.269607543945312, "logps/rejected": -32.67121124267578, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": 2.40834903717041, "rewards/margins": 2.40834903717041, "rewards/rejected": 0.0, "step": 801 }, { "epoch": 4.4804469273743015, "grad_norm": 4.542744408179407, "learning_rate": 9.046517616088544e-07, "logits/chosen": -3.6178178787231445, "logits/rejected": -3.6688997745513916, "logps/chosen": -8.076221466064453, "logps/rejected": -34.97129440307617, "loss": 0.1415, "rewards/accuracies": 1.0, "rewards/chosen": 2.155116558074951, "rewards/margins": 2.155116558074951, "rewards/rejected": 0.0, "step": 802 }, { "epoch": 4.4860335195530725, "grad_norm": 3.7891416567916476, "learning_rate": 9.042934202145362e-07, "logits/chosen": -3.5725836753845215, "logits/rejected": -3.2765321731567383, "logps/chosen": -20.888517379760742, "logps/rejected": -30.360881805419922, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 2.8129642009735107, "rewards/margins": 2.8129642009735107, "rewards/rejected": 0.0, "step": 803 }, { "epoch": 4.4916201117318435, "grad_norm": 3.25904645570946, "learning_rate": 9.039344779681949e-07, "logits/chosen": -3.187351942062378, "logits/rejected": -3.4457833766937256, "logps/chosen": -4.032071590423584, "logps/rejected": -53.12700271606445, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 1.8995722532272339, "rewards/margins": 1.8995722532272339, "rewards/rejected": 0.0, "step": 804 }, { "epoch": 4.4972067039106145, "grad_norm": 2.635557739396476, "learning_rate": 9.035749354032825e-07, "logits/chosen": -3.5149707794189453, "logits/rejected": -3.59647536277771, "logps/chosen": -5.334439277648926, "logps/rejected": -32.08082580566406, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 1.8468266725540161, "rewards/margins": 1.8468266725540161, "rewards/rejected": 0.0, "step": 805 }, { "epoch": 4.5027932960893855, "grad_norm": 5.125379045597751, "learning_rate": 9.032147930541434e-07, "logits/chosen": -3.469554901123047, "logits/rejected": -3.7115323543548584, "logps/chosen": -8.944414138793945, "logps/rejected": -29.938426971435547, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 2.080336570739746, "rewards/margins": 2.080336570739746, "rewards/rejected": 0.0, "step": 806 }, { "epoch": 4.5083798882681565, "grad_norm": 4.342979054517213, "learning_rate": 9.028540514560131e-07, "logits/chosen": -3.495372772216797, "logits/rejected": -3.6461708545684814, "logps/chosen": -6.056365489959717, "logps/rejected": -36.85601043701172, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 2.217763900756836, "rewards/margins": 2.217763900756836, "rewards/rejected": 0.0, "step": 807 }, { "epoch": 4.5139664804469275, "grad_norm": 6.01102840188483, "learning_rate": 9.02492711145018e-07, "logits/chosen": -3.437669038772583, "logits/rejected": -3.7074410915374756, "logps/chosen": -77.30291748046875, "logps/rejected": -25.93654441833496, "loss": 0.152, "rewards/accuracies": 1.0, "rewards/chosen": 2.7220306396484375, "rewards/margins": 2.7220306396484375, "rewards/rejected": 0.0, "step": 808 }, { "epoch": 4.5195530726256985, "grad_norm": 11.144390793167407, "learning_rate": 9.021307726581742e-07, "logits/chosen": -3.0836281776428223, "logits/rejected": -3.0821447372436523, "logps/chosen": -31.108070373535156, "logps/rejected": -50.718505859375, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 3.1599106788635254, "rewards/margins": 3.1599106788635254, "rewards/rejected": 0.0, "step": 809 }, { "epoch": 4.5251396648044695, "grad_norm": 6.333980529322715, "learning_rate": 9.017682365333866e-07, "logits/chosen": -3.517589569091797, "logits/rejected": -3.588407278060913, "logps/chosen": -7.623435974121094, "logps/rejected": -25.48214340209961, "loss": 0.134, "rewards/accuracies": 1.0, "rewards/chosen": 2.081376552581787, "rewards/margins": 2.081376552581787, "rewards/rejected": 0.0, "step": 810 }, { "epoch": 4.5307262569832405, "grad_norm": 3.1007002978569576, "learning_rate": 9.014051033094483e-07, "logits/chosen": -3.4597530364990234, "logits/rejected": -3.6112377643585205, "logps/chosen": -10.739227294921875, "logps/rejected": -32.503944396972656, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 1.9882330894470215, "rewards/margins": 1.9882330894470215, "rewards/rejected": 0.0, "step": 811 }, { "epoch": 4.5363128491620115, "grad_norm": 4.298191902823217, "learning_rate": 9.010413735260404e-07, "logits/chosen": -3.3846542835235596, "logits/rejected": -3.2056400775909424, "logps/chosen": -14.382534980773926, "logps/rejected": -53.71867752075195, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 2.1941494941711426, "rewards/margins": 2.1941494941711426, "rewards/rejected": 0.0, "step": 812 }, { "epoch": 4.5418994413407825, "grad_norm": 6.505601145909155, "learning_rate": 9.006770477237298e-07, "logits/chosen": -3.448770523071289, "logits/rejected": -3.4331557750701904, "logps/chosen": -4.308927536010742, "logps/rejected": -28.490875244140625, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": 1.4533464908599854, "rewards/margins": 1.4533464908599854, "rewards/rejected": 0.0, "step": 813 }, { "epoch": 4.547486033519553, "grad_norm": 6.117768020643388, "learning_rate": 9.003121264439696e-07, "logits/chosen": -3.6977038383483887, "logits/rejected": -3.5017526149749756, "logps/chosen": -7.282244682312012, "logps/rejected": -50.60511779785156, "loss": 0.1413, "rewards/accuracies": 1.0, "rewards/chosen": 1.1659576892852783, "rewards/margins": 1.1659576892852783, "rewards/rejected": 0.0, "step": 814 }, { "epoch": 4.553072625698324, "grad_norm": 3.6236896664606175, "learning_rate": 8.999466102290978e-07, "logits/chosen": -3.7826175689697266, "logits/rejected": -3.6877007484436035, "logps/chosen": -8.810380935668945, "logps/rejected": -30.6740779876709, "loss": 0.1511, "rewards/accuracies": 1.0, "rewards/chosen": 1.8914289474487305, "rewards/margins": 1.8914289474487305, "rewards/rejected": 0.0, "step": 815 }, { "epoch": 4.558659217877095, "grad_norm": 6.724955984642115, "learning_rate": 8.995804996223366e-07, "logits/chosen": -3.9157938957214355, "logits/rejected": -3.9207956790924072, "logps/chosen": -15.186198234558105, "logps/rejected": -35.508544921875, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 1.9603586196899414, "rewards/margins": 1.9603586196899414, "rewards/rejected": 0.0, "step": 816 }, { "epoch": 4.564245810055866, "grad_norm": 7.206472291417148, "learning_rate": 8.992137951677916e-07, "logits/chosen": -3.4340057373046875, "logits/rejected": -3.5320987701416016, "logps/chosen": -3.658921957015991, "logps/rejected": -24.223121643066406, "loss": 0.1704, "rewards/accuracies": 1.0, "rewards/chosen": 1.5331815481185913, "rewards/margins": 1.5331815481185913, "rewards/rejected": 0.0, "step": 817 }, { "epoch": 4.569832402234637, "grad_norm": 3.2631195235899497, "learning_rate": 8.988464974104509e-07, "logits/chosen": -3.5969433784484863, "logits/rejected": -3.9172513484954834, "logps/chosen": -5.465658187866211, "logps/rejected": -56.673118591308594, "loss": 0.1523, "rewards/accuracies": 1.0, "rewards/chosen": 1.8671543598175049, "rewards/margins": 1.8671543598175049, "rewards/rejected": 0.0, "step": 818 }, { "epoch": 4.575418994413408, "grad_norm": 3.32968828017197, "learning_rate": 8.984786068961843e-07, "logits/chosen": -3.925957202911377, "logits/rejected": -3.8275678157806396, "logps/chosen": -14.856788635253906, "logps/rejected": -25.89576530456543, "loss": 0.1508, "rewards/accuracies": 1.0, "rewards/chosen": 2.517965793609619, "rewards/margins": 2.517965793609619, "rewards/rejected": 0.0, "step": 819 }, { "epoch": 4.581005586592179, "grad_norm": 3.693836941985265, "learning_rate": 8.981101241717427e-07, "logits/chosen": -3.3613967895507812, "logits/rejected": -3.5036234855651855, "logps/chosen": -5.705307483673096, "logps/rejected": -23.447486877441406, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 1.9145679473876953, "rewards/margins": 1.9145679473876953, "rewards/rejected": 0.0, "step": 820 }, { "epoch": 4.581005586592179, "eval_logits/chosen": -3.43200945854187, "eval_logits/rejected": -3.550126552581787, "eval_logps/chosen": -20.60776138305664, "eval_logps/rejected": -36.83216094970703, "eval_loss": 0.45647016167640686, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9462281465530396, "eval_rewards/margins": 0.9462281465530396, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7826, "eval_samples_per_second": 9.456, "eval_steps_per_second": 0.305, "step": 820 }, { "epoch": 4.58659217877095, "grad_norm": 3.2726788992489713, "learning_rate": 8.977410497847571e-07, "logits/chosen": -3.7218239307403564, "logits/rejected": -3.643237590789795, "logps/chosen": -11.734451293945312, "logps/rejected": -33.06830596923828, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 1.9959282875061035, "rewards/margins": 1.9959282875061035, "rewards/rejected": 0.0, "step": 821 }, { "epoch": 4.592178770949721, "grad_norm": 3.8309460160415325, "learning_rate": 8.973713842837378e-07, "logits/chosen": -3.600090503692627, "logits/rejected": -3.7199010848999023, "logps/chosen": -11.833852767944336, "logps/rejected": -28.740642547607422, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 2.4016051292419434, "rewards/margins": 2.4016051292419434, "rewards/rejected": 0.0, "step": 822 }, { "epoch": 4.597765363128492, "grad_norm": 3.698424680136248, "learning_rate": 8.970011282180733e-07, "logits/chosen": -3.499113082885742, "logits/rejected": -3.7541868686676025, "logps/chosen": -24.649921417236328, "logps/rejected": -37.322784423828125, "loss": 0.1373, "rewards/accuracies": 1.0, "rewards/chosen": 2.4891157150268555, "rewards/margins": 2.4891157150268555, "rewards/rejected": 0.0, "step": 823 }, { "epoch": 4.603351955307263, "grad_norm": 3.314445175410359, "learning_rate": 8.966302821380302e-07, "logits/chosen": -3.5853443145751953, "logits/rejected": -3.616360902786255, "logps/chosen": -7.119479656219482, "logps/rejected": -29.054214477539062, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": 2.3403353691101074, "rewards/margins": 2.3403353691101074, "rewards/rejected": 0.0, "step": 824 }, { "epoch": 4.608938547486034, "grad_norm": 3.813873948837155, "learning_rate": 8.96258846594752e-07, "logits/chosen": -3.519484281539917, "logits/rejected": -3.314337730407715, "logps/chosen": -3.3653411865234375, "logps/rejected": -64.49395751953125, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": 1.6540875434875488, "rewards/margins": 1.6540875434875488, "rewards/rejected": 0.0, "step": 825 }, { "epoch": 4.614525139664805, "grad_norm": 4.044003592559312, "learning_rate": 8.95886822140258e-07, "logits/chosen": -3.553804874420166, "logits/rejected": -3.6048622131347656, "logps/chosen": -11.220242500305176, "logps/rejected": -12.231485366821289, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 2.2462263107299805, "rewards/margins": 2.2462263107299805, "rewards/rejected": 0.0, "step": 826 }, { "epoch": 4.620111731843576, "grad_norm": 3.0775225868967766, "learning_rate": 8.955142093274426e-07, "logits/chosen": -3.465428352355957, "logits/rejected": -3.665736675262451, "logps/chosen": -5.22053337097168, "logps/rejected": -22.805583953857422, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 2.120401382446289, "rewards/margins": 2.120401382446289, "rewards/rejected": 0.0, "step": 827 }, { "epoch": 4.625698324022347, "grad_norm": 3.165298406526679, "learning_rate": 8.951410087100749e-07, "logits/chosen": -3.749488353729248, "logits/rejected": -3.714769124984741, "logps/chosen": -2.7161521911621094, "logps/rejected": -33.90278244018555, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 1.5537762641906738, "rewards/margins": 1.5537762641906738, "rewards/rejected": 0.0, "step": 828 }, { "epoch": 4.631284916201118, "grad_norm": 7.899198610372127, "learning_rate": 8.947672208427976e-07, "logits/chosen": -3.750854730606079, "logits/rejected": -3.6782498359680176, "logps/chosen": -6.590742588043213, "logps/rejected": -46.52386474609375, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 2.104245662689209, "rewards/margins": 2.104245662689209, "rewards/rejected": 0.0, "step": 829 }, { "epoch": 4.636871508379889, "grad_norm": 2.6030914754418926, "learning_rate": 8.943928462811257e-07, "logits/chosen": -3.698634386062622, "logits/rejected": -3.8758480548858643, "logps/chosen": -17.00765609741211, "logps/rejected": -34.08386993408203, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 2.54410457611084, "rewards/margins": 2.54410457611084, "rewards/rejected": 0.0, "step": 830 }, { "epoch": 4.64245810055866, "grad_norm": 4.6086346471003985, "learning_rate": 8.940178855814468e-07, "logits/chosen": -3.7106263637542725, "logits/rejected": -3.6128463745117188, "logps/chosen": -14.202127456665039, "logps/rejected": -31.858396530151367, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 2.933572769165039, "rewards/margins": 2.933572769165039, "rewards/rejected": 0.0, "step": 831 }, { "epoch": 4.648044692737431, "grad_norm": 5.784086044638562, "learning_rate": 8.936423393010193e-07, "logits/chosen": -2.882251024246216, "logits/rejected": -3.0542917251586914, "logps/chosen": -28.222265243530273, "logps/rejected": -54.81916427612305, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": 2.224271774291992, "rewards/margins": 2.224271774291992, "rewards/rejected": 0.0, "step": 832 }, { "epoch": 4.653631284916202, "grad_norm": 3.5986533913630003, "learning_rate": 8.932662079979717e-07, "logits/chosen": -3.5953774452209473, "logits/rejected": -3.6140482425689697, "logps/chosen": -4.883005619049072, "logps/rejected": -67.65734100341797, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 1.9462710618972778, "rewards/margins": 1.9462710618972778, "rewards/rejected": 0.0, "step": 833 }, { "epoch": 4.659217877094972, "grad_norm": 5.502857848087159, "learning_rate": 8.928894922313022e-07, "logits/chosen": -3.6708383560180664, "logits/rejected": -3.9550371170043945, "logps/chosen": -24.624183654785156, "logps/rejected": -26.975831985473633, "loss": 0.1528, "rewards/accuracies": 1.0, "rewards/chosen": 2.015517234802246, "rewards/margins": 2.015517234802246, "rewards/rejected": 0.0, "step": 834 }, { "epoch": 4.664804469273743, "grad_norm": 4.464777725710083, "learning_rate": 8.925121925608772e-07, "logits/chosen": -3.9531307220458984, "logits/rejected": -3.589585542678833, "logps/chosen": -9.957625389099121, "logps/rejected": -56.52036666870117, "loss": 0.1909, "rewards/accuracies": 1.0, "rewards/chosen": 2.4638419151306152, "rewards/margins": 2.4638419151306152, "rewards/rejected": 0.0, "step": 835 }, { "epoch": 4.670391061452514, "grad_norm": 4.323240389193277, "learning_rate": 8.921343095474316e-07, "logits/chosen": -3.4413089752197266, "logits/rejected": -3.384950876235962, "logps/chosen": -12.17386245727539, "logps/rejected": -44.575828552246094, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 2.343230724334717, "rewards/margins": 2.343230724334717, "rewards/rejected": 0.0, "step": 836 }, { "epoch": 4.675977653631285, "grad_norm": 4.871331071988654, "learning_rate": 8.917558437525667e-07, "logits/chosen": -3.6823201179504395, "logits/rejected": -3.872624397277832, "logps/chosen": -3.832653045654297, "logps/rejected": -62.0364875793457, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 1.7777758836746216, "rewards/margins": 1.7777758836746216, "rewards/rejected": 0.0, "step": 837 }, { "epoch": 4.681564245810056, "grad_norm": 2.739482165182619, "learning_rate": 8.913767957387501e-07, "logits/chosen": -3.8687398433685303, "logits/rejected": -3.897033452987671, "logps/chosen": -8.018453598022461, "logps/rejected": -42.729331970214844, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 2.448753833770752, "rewards/margins": 2.448753833770752, "rewards/rejected": 0.0, "step": 838 }, { "epoch": 4.687150837988827, "grad_norm": 3.183335515081523, "learning_rate": 8.909971660693147e-07, "logits/chosen": -3.34853196144104, "logits/rejected": -3.4277381896972656, "logps/chosen": -8.967146873474121, "logps/rejected": -23.089962005615234, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 2.13580060005188, "rewards/margins": 2.13580060005188, "rewards/rejected": 0.0, "step": 839 }, { "epoch": 4.692737430167598, "grad_norm": 7.559023766067752, "learning_rate": 8.906169553084577e-07, "logits/chosen": -3.4968342781066895, "logits/rejected": -3.479490280151367, "logps/chosen": -4.84238338470459, "logps/rejected": -29.289892196655273, "loss": 0.2087, "rewards/accuracies": 1.0, "rewards/chosen": 1.613553524017334, "rewards/margins": 1.613553524017334, "rewards/rejected": 0.0, "step": 840 }, { "epoch": 4.692737430167598, "eval_logits/chosen": -3.4382870197296143, "eval_logits/rejected": -3.557262897491455, "eval_logps/chosen": -20.889551162719727, "eval_logps/rejected": -37.64310073852539, "eval_loss": 0.4653335511684418, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9180494546890259, "eval_rewards/margins": 0.9180494546890259, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7538, "eval_samples_per_second": 9.465, "eval_steps_per_second": 0.305, "step": 840 }, { "epoch": 4.698324022346369, "grad_norm": 4.164925855011585, "learning_rate": 8.9023616402124e-07, "logits/chosen": -3.6509761810302734, "logits/rejected": -3.4124083518981934, "logps/chosen": -13.185310363769531, "logps/rejected": -19.683773040771484, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 2.4277474880218506, "rewards/margins": 2.4277474880218506, "rewards/rejected": 0.0, "step": 841 }, { "epoch": 4.70391061452514, "grad_norm": 4.095634852224189, "learning_rate": 8.898547927735853e-07, "logits/chosen": -3.4244139194488525, "logits/rejected": -3.3410980701446533, "logps/chosen": -20.241806030273438, "logps/rejected": -25.040424346923828, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 2.5954225063323975, "rewards/margins": 2.5954225063323975, "rewards/rejected": 0.0, "step": 842 }, { "epoch": 4.709497206703911, "grad_norm": 3.8874672480164203, "learning_rate": 8.894728421322792e-07, "logits/chosen": -3.538426160812378, "logits/rejected": -3.6762707233428955, "logps/chosen": -34.24104690551758, "logps/rejected": -28.45409393310547, "loss": 0.1536, "rewards/accuracies": 1.0, "rewards/chosen": 2.510376453399658, "rewards/margins": 2.510376453399658, "rewards/rejected": 0.0, "step": 843 }, { "epoch": 4.715083798882682, "grad_norm": 5.641882413941325, "learning_rate": 8.890903126649685e-07, "logits/chosen": -3.534832000732422, "logits/rejected": -3.3883702754974365, "logps/chosen": -6.195542335510254, "logps/rejected": -21.10651397705078, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 2.074922800064087, "rewards/margins": 2.074922800064087, "rewards/rejected": 0.0, "step": 844 }, { "epoch": 4.720670391061453, "grad_norm": 3.225454777473204, "learning_rate": 8.887072049401598e-07, "logits/chosen": -3.827566385269165, "logits/rejected": -3.703700304031372, "logps/chosen": -6.3797197341918945, "logps/rejected": -25.23931312561035, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 2.050833225250244, "rewards/margins": 2.050833225250244, "rewards/rejected": 0.0, "step": 845 }, { "epoch": 4.726256983240224, "grad_norm": 4.180459311375749, "learning_rate": 8.883235195272196e-07, "logits/chosen": -3.547757148742676, "logits/rejected": -3.5946600437164307, "logps/chosen": -7.756105899810791, "logps/rejected": -29.495033264160156, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 2.3571243286132812, "rewards/margins": 2.3571243286132812, "rewards/rejected": 0.0, "step": 846 }, { "epoch": 4.731843575418995, "grad_norm": 5.9734829206994995, "learning_rate": 8.879392569963728e-07, "logits/chosen": -3.4734814167022705, "logits/rejected": -3.33512020111084, "logps/chosen": -12.2142333984375, "logps/rejected": -28.297042846679688, "loss": 0.1844, "rewards/accuracies": 1.0, "rewards/chosen": 2.387676954269409, "rewards/margins": 2.387676954269409, "rewards/rejected": 0.0, "step": 847 }, { "epoch": 4.737430167597766, "grad_norm": 4.2661422590982, "learning_rate": 8.87554417918702e-07, "logits/chosen": -3.9001755714416504, "logits/rejected": -3.9348902702331543, "logps/chosen": -6.842373371124268, "logps/rejected": -26.677532196044922, "loss": 0.1663, "rewards/accuracies": 1.0, "rewards/chosen": 1.8079452514648438, "rewards/margins": 1.8079452514648438, "rewards/rejected": 0.0, "step": 848 }, { "epoch": 4.743016759776537, "grad_norm": 5.05967023339862, "learning_rate": 8.871690028661464e-07, "logits/chosen": -3.479654550552368, "logits/rejected": -3.278282403945923, "logps/chosen": -20.254270553588867, "logps/rejected": -81.40043640136719, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 2.504744052886963, "rewards/margins": 2.504744052886963, "rewards/rejected": 0.0, "step": 849 }, { "epoch": 4.748603351955307, "grad_norm": 3.7319728586212753, "learning_rate": 8.867830124115014e-07, "logits/chosen": -3.176271915435791, "logits/rejected": -3.3628337383270264, "logps/chosen": -16.692127227783203, "logps/rejected": -42.161102294921875, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 2.4846765995025635, "rewards/margins": 2.4846765995025635, "rewards/rejected": 0.0, "step": 850 }, { "epoch": 4.754189944134078, "grad_norm": 4.783110484472577, "learning_rate": 8.863964471284178e-07, "logits/chosen": -3.7601890563964844, "logits/rejected": -3.7013044357299805, "logps/chosen": -5.958396911621094, "logps/rejected": -22.45055389404297, "loss": 0.1333, "rewards/accuracies": 1.0, "rewards/chosen": 2.179135322570801, "rewards/margins": 2.179135322570801, "rewards/rejected": 0.0, "step": 851 }, { "epoch": 4.759776536312849, "grad_norm": 5.7151029669725935, "learning_rate": 8.860093075914004e-07, "logits/chosen": -3.2503085136413574, "logits/rejected": -3.472109794616699, "logps/chosen": -3.8963963985443115, "logps/rejected": -80.78500366210938, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 1.7483668327331543, "rewards/margins": 1.7483668327331543, "rewards/rejected": 0.0, "step": 852 }, { "epoch": 4.76536312849162, "grad_norm": 9.094997919284872, "learning_rate": 8.856215943758074e-07, "logits/chosen": -3.5655927658081055, "logits/rejected": -3.1865780353546143, "logps/chosen": -11.715600967407227, "logps/rejected": -69.3724594116211, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 2.4587438106536865, "rewards/margins": 2.4587438106536865, "rewards/rejected": 0.0, "step": 853 }, { "epoch": 4.770949720670391, "grad_norm": 5.112652606314636, "learning_rate": 8.852333080578495e-07, "logits/chosen": -3.502084255218506, "logits/rejected": -3.748291492462158, "logps/chosen": -9.790876388549805, "logps/rejected": -19.313125610351562, "loss": 0.1628, "rewards/accuracies": 1.0, "rewards/chosen": 1.8427542448043823, "rewards/margins": 1.8427542448043823, "rewards/rejected": 0.0, "step": 854 }, { "epoch": 4.776536312849162, "grad_norm": 4.848025576669861, "learning_rate": 8.848444492145899e-07, "logits/chosen": -3.5827841758728027, "logits/rejected": -3.7325146198272705, "logps/chosen": -13.776187896728516, "logps/rejected": -30.147903442382812, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 2.8059005737304688, "rewards/margins": 2.8059005737304688, "rewards/rejected": 0.0, "step": 855 }, { "epoch": 4.782122905027933, "grad_norm": 4.2627260769307025, "learning_rate": 8.844550184239414e-07, "logits/chosen": -3.709784746170044, "logits/rejected": -3.637117624282837, "logps/chosen": -27.101978302001953, "logps/rejected": -38.568138122558594, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 2.3738842010498047, "rewards/margins": 2.3738842010498047, "rewards/rejected": 0.0, "step": 856 }, { "epoch": 4.787709497206704, "grad_norm": 3.2765455080161354, "learning_rate": 8.840650162646679e-07, "logits/chosen": -3.7055535316467285, "logits/rejected": -3.7811439037323, "logps/chosen": -6.742583274841309, "logps/rejected": -61.49567413330078, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": 2.0676863193511963, "rewards/margins": 2.0676863193511963, "rewards/rejected": 0.0, "step": 857 }, { "epoch": 4.793296089385475, "grad_norm": 3.440546922947156, "learning_rate": 8.836744433163821e-07, "logits/chosen": -3.4895920753479004, "logits/rejected": -3.6672732830047607, "logps/chosen": -32.132694244384766, "logps/rejected": -29.041641235351562, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 2.5951497554779053, "rewards/margins": 2.5951497554779053, "rewards/rejected": 0.0, "step": 858 }, { "epoch": 4.798882681564246, "grad_norm": 6.333532675924077, "learning_rate": 8.832833001595448e-07, "logits/chosen": -3.485452175140381, "logits/rejected": -3.5626187324523926, "logps/chosen": -6.243794918060303, "logps/rejected": -54.92760467529297, "loss": 0.1666, "rewards/accuracies": 1.0, "rewards/chosen": 1.4682459831237793, "rewards/margins": 1.4682459831237793, "rewards/rejected": 0.0, "step": 859 }, { "epoch": 4.804469273743017, "grad_norm": 9.321206136113556, "learning_rate": 8.828915873754643e-07, "logits/chosen": -3.8284683227539062, "logits/rejected": -3.896667718887329, "logps/chosen": -7.002790451049805, "logps/rejected": -38.846466064453125, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 1.6483442783355713, "rewards/margins": 1.6483442783355713, "rewards/rejected": 0.0, "step": 860 }, { "epoch": 4.804469273743017, "eval_logits/chosen": -3.4337158203125, "eval_logits/rejected": -3.5523319244384766, "eval_logps/chosen": -20.837451934814453, "eval_logps/rejected": -37.41223907470703, "eval_loss": 0.45792660117149353, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.9232591390609741, "eval_rewards/margins": 0.9232591390609741, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7351, "eval_samples_per_second": 9.47, "eval_steps_per_second": 0.305, "step": 860 }, { "epoch": 4.810055865921788, "grad_norm": 10.558126361014091, "learning_rate": 8.82499305546296e-07, "logits/chosen": -3.3911778926849365, "logits/rejected": -3.5714023113250732, "logps/chosen": -29.563304901123047, "logps/rejected": -45.68463897705078, "loss": 0.1503, "rewards/accuracies": 1.0, "rewards/chosen": 2.3393919467926025, "rewards/margins": 2.3393919467926025, "rewards/rejected": 0.0, "step": 861 }, { "epoch": 4.815642458100559, "grad_norm": 3.6575243733620337, "learning_rate": 8.821064552550399e-07, "logits/chosen": -3.264042615890503, "logits/rejected": -3.4649696350097656, "logps/chosen": -7.36049222946167, "logps/rejected": -49.74733352661133, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 1.71518874168396, "rewards/margins": 1.71518874168396, "rewards/rejected": 0.0, "step": 862 }, { "epoch": 4.82122905027933, "grad_norm": 5.312389583932255, "learning_rate": 8.817130370855421e-07, "logits/chosen": -3.665332794189453, "logits/rejected": -3.640456199645996, "logps/chosen": -7.155714988708496, "logps/rejected": -18.648021697998047, "loss": 0.1943, "rewards/accuracies": 0.75, "rewards/chosen": 1.0625817775726318, "rewards/margins": 1.0625817775726318, "rewards/rejected": 0.0, "step": 863 }, { "epoch": 4.826815642458101, "grad_norm": 4.271741315348231, "learning_rate": 8.813190516224917e-07, "logits/chosen": -3.786005973815918, "logits/rejected": -3.8781280517578125, "logps/chosen": -18.279701232910156, "logps/rejected": -27.392444610595703, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 2.885749340057373, "rewards/margins": 2.885749340057373, "rewards/rejected": 0.0, "step": 864 }, { "epoch": 4.832402234636872, "grad_norm": 8.007426726489133, "learning_rate": 8.809244994514216e-07, "logits/chosen": -3.5738139152526855, "logits/rejected": -3.54390811920166, "logps/chosen": -9.149717330932617, "logps/rejected": -60.3470458984375, "loss": 0.1444, "rewards/accuracies": 1.0, "rewards/chosen": 2.350733757019043, "rewards/margins": 2.350733757019043, "rewards/rejected": 0.0, "step": 865 }, { "epoch": 4.837988826815643, "grad_norm": 3.501883519918708, "learning_rate": 8.80529381158706e-07, "logits/chosen": -3.549853563308716, "logits/rejected": -3.669538736343384, "logps/chosen": -5.122585773468018, "logps/rejected": -53.249053955078125, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 1.3994872570037842, "rewards/margins": 1.3994872570037842, "rewards/rejected": 0.0, "step": 866 }, { "epoch": 4.843575418994414, "grad_norm": 4.794695180692528, "learning_rate": 8.801336973315617e-07, "logits/chosen": -3.5816879272460938, "logits/rejected": -3.6130690574645996, "logps/chosen": -13.329113960266113, "logps/rejected": -46.01040267944336, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": 2.4738619327545166, "rewards/margins": 2.4738619327545166, "rewards/rejected": 0.0, "step": 867 }, { "epoch": 4.849162011173185, "grad_norm": 3.4681825650643416, "learning_rate": 8.79737448558045e-07, "logits/chosen": -3.8831658363342285, "logits/rejected": -3.7107584476470947, "logps/chosen": -7.576117515563965, "logps/rejected": -37.6939582824707, "loss": 0.1578, "rewards/accuracies": 1.0, "rewards/chosen": 2.096287250518799, "rewards/margins": 2.096287250518799, "rewards/rejected": 0.0, "step": 868 }, { "epoch": 4.854748603351956, "grad_norm": 3.522115677977157, "learning_rate": 8.793406354270522e-07, "logits/chosen": -3.5656511783599854, "logits/rejected": -3.636209487915039, "logps/chosen": -9.377242088317871, "logps/rejected": -19.11086654663086, "loss": 0.1471, "rewards/accuracies": 1.0, "rewards/chosen": 2.1785616874694824, "rewards/margins": 2.1785616874694824, "rewards/rejected": 0.0, "step": 869 }, { "epoch": 4.860335195530726, "grad_norm": 3.636940330890014, "learning_rate": 8.789432585283182e-07, "logits/chosen": -3.9074556827545166, "logits/rejected": -3.8623340129852295, "logps/chosen": -9.630167961120605, "logps/rejected": -20.304290771484375, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": 2.2927615642547607, "rewards/margins": 2.2927615642547607, "rewards/rejected": 0.0, "step": 870 }, { "epoch": 4.865921787709497, "grad_norm": 8.786746912693621, "learning_rate": 8.785453184524161e-07, "logits/chosen": -3.4771811962127686, "logits/rejected": -3.7050230503082275, "logps/chosen": -5.9504594802856445, "logps/rejected": -50.320777893066406, "loss": 0.1872, "rewards/accuracies": 0.75, "rewards/chosen": 1.0571459531784058, "rewards/margins": 1.0571459531784058, "rewards/rejected": 0.0, "step": 871 }, { "epoch": 4.871508379888268, "grad_norm": 4.016442501619869, "learning_rate": 8.781468157907554e-07, "logits/chosen": -3.5626220703125, "logits/rejected": -3.583447217941284, "logps/chosen": -9.067211151123047, "logps/rejected": -26.456647872924805, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 1.9865537881851196, "rewards/margins": 1.9865537881851196, "rewards/rejected": 0.0, "step": 872 }, { "epoch": 4.877094972067039, "grad_norm": 4.53543096354546, "learning_rate": 8.777477511355822e-07, "logits/chosen": -3.3158986568450928, "logits/rejected": -3.3363637924194336, "logps/chosen": -6.712955951690674, "logps/rejected": -32.11681365966797, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 2.324026346206665, "rewards/margins": 2.324026346206665, "rewards/rejected": 0.0, "step": 873 }, { "epoch": 4.88268156424581, "grad_norm": 3.3915553921064108, "learning_rate": 8.773481250799776e-07, "logits/chosen": -3.3401966094970703, "logits/rejected": -3.218843936920166, "logps/chosen": -5.150437831878662, "logps/rejected": -48.85731506347656, "loss": 0.1429, "rewards/accuracies": 1.0, "rewards/chosen": 1.6175756454467773, "rewards/margins": 1.6175756454467773, "rewards/rejected": 0.0, "step": 874 }, { "epoch": 4.888268156424581, "grad_norm": 5.366800397444954, "learning_rate": 8.76947938217857e-07, "logits/chosen": -3.6672873497009277, "logits/rejected": -3.4623639583587646, "logps/chosen": -4.0337114334106445, "logps/rejected": -21.869462966918945, "loss": 0.1619, "rewards/accuracies": 1.0, "rewards/chosen": 1.2740421295166016, "rewards/margins": 1.2740421295166016, "rewards/rejected": 0.0, "step": 875 }, { "epoch": 4.893854748603352, "grad_norm": 6.212216346123267, "learning_rate": 8.765471911439697e-07, "logits/chosen": -3.3895440101623535, "logits/rejected": -3.4063079357147217, "logps/chosen": -33.54104995727539, "logps/rejected": -39.06547546386719, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 3.6400959491729736, "rewards/margins": 3.6400959491729736, "rewards/rejected": 0.0, "step": 876 }, { "epoch": 4.899441340782123, "grad_norm": 8.414250727496093, "learning_rate": 8.76145884453897e-07, "logits/chosen": -3.5738282203674316, "logits/rejected": -3.5223355293273926, "logps/chosen": -12.802833557128906, "logps/rejected": -20.496601104736328, "loss": 0.2038, "rewards/accuracies": 0.75, "rewards/chosen": 1.4248250722885132, "rewards/margins": 1.4248250722885132, "rewards/rejected": 0.0, "step": 877 }, { "epoch": 4.905027932960894, "grad_norm": 6.795900358584056, "learning_rate": 8.757440187440519e-07, "logits/chosen": -3.712117910385132, "logits/rejected": -3.529858350753784, "logps/chosen": -8.133525848388672, "logps/rejected": -24.60988998413086, "loss": 0.1829, "rewards/accuracies": 0.75, "rewards/chosen": 1.5177634954452515, "rewards/margins": 1.5177634954452515, "rewards/rejected": 0.0, "step": 878 }, { "epoch": 4.910614525139665, "grad_norm": 4.620793169748076, "learning_rate": 8.753415946116786e-07, "logits/chosen": -3.4427971839904785, "logits/rejected": -3.714064836502075, "logps/chosen": -3.4842402935028076, "logps/rejected": -55.89486312866211, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 1.6254570484161377, "rewards/margins": 1.6254570484161377, "rewards/rejected": 0.0, "step": 879 }, { "epoch": 4.916201117318436, "grad_norm": 3.6278667379031915, "learning_rate": 8.74938612654851e-07, "logits/chosen": -3.660454511642456, "logits/rejected": -3.769160509109497, "logps/chosen": -7.781861305236816, "logps/rejected": -52.43263244628906, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": 2.0503783226013184, "rewards/margins": 2.0503783226013184, "rewards/rejected": 0.0, "step": 880 }, { "epoch": 4.916201117318436, "eval_logits/chosen": -3.4445042610168457, "eval_logits/rejected": -3.562300205230713, "eval_logps/chosen": -21.081167221069336, "eval_logps/rejected": -37.07769012451172, "eval_loss": 0.45790335536003113, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.8988874554634094, "eval_rewards/margins": 0.8988874554634094, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6974, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 880 }, { "epoch": 4.921787709497207, "grad_norm": 3.532124702646352, "learning_rate": 8.745350734724721e-07, "logits/chosen": -3.6082777976989746, "logits/rejected": -3.5758960247039795, "logps/chosen": -5.217126846313477, "logps/rejected": -43.59714126586914, "loss": 0.1326, "rewards/accuracies": 1.0, "rewards/chosen": 1.950150728225708, "rewards/margins": 1.950150728225708, "rewards/rejected": 0.0, "step": 881 }, { "epoch": 4.927374301675978, "grad_norm": 5.148709902345891, "learning_rate": 8.741309776642732e-07, "logits/chosen": -3.537998914718628, "logits/rejected": -3.392167568206787, "logps/chosen": -4.527282238006592, "logps/rejected": -38.122215270996094, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 1.907090663909912, "rewards/margins": 1.907090663909912, "rewards/rejected": 0.0, "step": 882 }, { "epoch": 4.932960893854749, "grad_norm": 4.587751704336057, "learning_rate": 8.73726325830812e-07, "logits/chosen": -3.6477560997009277, "logits/rejected": -3.5672593116760254, "logps/chosen": -15.186811447143555, "logps/rejected": -20.84063148498535, "loss": 0.1278, "rewards/accuracies": 1.0, "rewards/chosen": 2.6742024421691895, "rewards/margins": 2.6742024421691895, "rewards/rejected": 0.0, "step": 883 }, { "epoch": 4.93854748603352, "grad_norm": 3.195306283341391, "learning_rate": 8.733211185734739e-07, "logits/chosen": -3.3010315895080566, "logits/rejected": -3.403163194656372, "logps/chosen": -19.622634887695312, "logps/rejected": -43.96053695678711, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 2.4762537479400635, "rewards/margins": 2.4762537479400635, "rewards/rejected": 0.0, "step": 884 }, { "epoch": 4.94413407821229, "grad_norm": 4.803074260751064, "learning_rate": 8.729153564944687e-07, "logits/chosen": -3.221707582473755, "logits/rejected": -3.2944374084472656, "logps/chosen": -14.754175186157227, "logps/rejected": -35.13188934326172, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 1.9772963523864746, "rewards/margins": 1.9772963523864746, "rewards/rejected": 0.0, "step": 885 }, { "epoch": 4.949720670391061, "grad_norm": 4.0082091117970915, "learning_rate": 8.725090401968312e-07, "logits/chosen": -3.5048813819885254, "logits/rejected": -3.5432684421539307, "logps/chosen": -11.013408660888672, "logps/rejected": -22.639747619628906, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 2.150505542755127, "rewards/margins": 2.150505542755127, "rewards/rejected": 0.0, "step": 886 }, { "epoch": 4.955307262569832, "grad_norm": 3.0866883461040384, "learning_rate": 8.721021702844196e-07, "logits/chosen": -3.496249198913574, "logits/rejected": -3.6273727416992188, "logps/chosen": -6.44735050201416, "logps/rejected": -57.3204345703125, "loss": 0.1356, "rewards/accuracies": 1.0, "rewards/chosen": 2.3810064792633057, "rewards/margins": 2.3810064792633057, "rewards/rejected": 0.0, "step": 887 }, { "epoch": 4.960893854748603, "grad_norm": 3.206021847477474, "learning_rate": 8.716947473619153e-07, "logits/chosen": -3.3977081775665283, "logits/rejected": -3.6455678939819336, "logps/chosen": -11.624605178833008, "logps/rejected": -52.049560546875, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.2550697326660156, "rewards/margins": 2.2550697326660156, "rewards/rejected": 0.0, "step": 888 }, { "epoch": 4.966480446927374, "grad_norm": 3.1866427838508047, "learning_rate": 8.712867720348211e-07, "logits/chosen": -3.441948890686035, "logits/rejected": -3.4032344818115234, "logps/chosen": -13.188152313232422, "logps/rejected": -31.50457763671875, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 2.3232598304748535, "rewards/margins": 2.3232598304748535, "rewards/rejected": 0.0, "step": 889 }, { "epoch": 4.972067039106145, "grad_norm": 2.5771609283540378, "learning_rate": 8.708782449094611e-07, "logits/chosen": -3.64654803276062, "logits/rejected": -3.7692649364471436, "logps/chosen": -14.121740341186523, "logps/rejected": -22.496761322021484, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 2.407217502593994, "rewards/margins": 2.407217502593994, "rewards/rejected": 0.0, "step": 890 }, { "epoch": 4.977653631284916, "grad_norm": 5.04488215485463, "learning_rate": 8.704691665929795e-07, "logits/chosen": -3.8497824668884277, "logits/rejected": -3.7043254375457764, "logps/chosen": -11.662101745605469, "logps/rejected": -25.833295822143555, "loss": 0.1808, "rewards/accuracies": 1.0, "rewards/chosen": 1.9730836153030396, "rewards/margins": 1.9730836153030396, "rewards/rejected": 0.0, "step": 891 }, { "epoch": 4.983240223463687, "grad_norm": 3.658861469188171, "learning_rate": 8.700595376933393e-07, "logits/chosen": -3.8118834495544434, "logits/rejected": -3.5197701454162598, "logps/chosen": -13.414787292480469, "logps/rejected": -56.63683319091797, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 2.0512075424194336, "rewards/margins": 2.0512075424194336, "rewards/rejected": 0.0, "step": 892 }, { "epoch": 4.988826815642458, "grad_norm": 2.8903311625392654, "learning_rate": 8.696493588193221e-07, "logits/chosen": -3.328495979309082, "logits/rejected": -3.4698615074157715, "logps/chosen": -3.700801134109497, "logps/rejected": -44.19495391845703, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 1.874070405960083, "rewards/margins": 1.874070405960083, "rewards/rejected": 0.0, "step": 893 }, { "epoch": 4.994413407821229, "grad_norm": 3.2934790346791663, "learning_rate": 8.692386305805269e-07, "logits/chosen": -3.1227805614471436, "logits/rejected": -3.09611439704895, "logps/chosen": -35.340484619140625, "logps/rejected": -68.12325286865234, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 2.268045425415039, "rewards/margins": 2.268045425415039, "rewards/rejected": 0.0, "step": 894 }, { "epoch": 5.0, "grad_norm": 2.0998319711198743, "learning_rate": 8.688273535873686e-07, "logits/chosen": -3.8451521396636963, "logits/rejected": -3.4695792198181152, "logps/chosen": -9.727790832519531, "logps/rejected": -28.658916473388672, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.6223816871643066, "rewards/margins": 2.6223816871643066, "rewards/rejected": 0.0, "step": 895 }, { "epoch": 5.005586592178771, "grad_norm": 6.127492385172778, "learning_rate": 8.684155284510783e-07, "logits/chosen": -3.7467594146728516, "logits/rejected": -3.6886558532714844, "logps/chosen": -12.076903343200684, "logps/rejected": -22.749643325805664, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": 2.371918201446533, "rewards/margins": 2.371918201446533, "rewards/rejected": 0.0, "step": 896 }, { "epoch": 5.011173184357542, "grad_norm": 2.0759935000059735, "learning_rate": 8.680031557837018e-07, "logits/chosen": -3.2360999584198, "logits/rejected": -3.2278568744659424, "logps/chosen": -11.073640823364258, "logps/rejected": -63.07777786254883, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 2.6215603351593018, "rewards/margins": 2.6215603351593018, "rewards/rejected": 0.0, "step": 897 }, { "epoch": 5.016759776536313, "grad_norm": 7.738618865663303, "learning_rate": 8.675902361980978e-07, "logits/chosen": -3.3227431774139404, "logits/rejected": -3.1236305236816406, "logps/chosen": -4.390902042388916, "logps/rejected": -19.178699493408203, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": 2.052748441696167, "rewards/margins": 2.052748441696167, "rewards/rejected": 0.0, "step": 898 }, { "epoch": 5.022346368715084, "grad_norm": 2.6501499890783338, "learning_rate": 8.671767703079386e-07, "logits/chosen": -3.8698344230651855, "logits/rejected": -3.894357204437256, "logps/chosen": -21.294483184814453, "logps/rejected": -44.14567184448242, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 3.803455352783203, "rewards/margins": 3.803455352783203, "rewards/rejected": 0.0, "step": 899 }, { "epoch": 5.027932960893855, "grad_norm": 2.386846191424922, "learning_rate": 8.667627587277081e-07, "logits/chosen": -3.7314257621765137, "logits/rejected": -3.7088756561279297, "logps/chosen": -3.20855450630188, "logps/rejected": -48.41551971435547, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.193094491958618, "rewards/margins": 2.193094491958618, "rewards/rejected": 0.0, "step": 900 }, { "epoch": 5.027932960893855, "eval_logits/chosen": -3.435518980026245, "eval_logits/rejected": -3.5585150718688965, "eval_logps/chosen": -21.40536117553711, "eval_logps/rejected": -38.2454719543457, "eval_loss": 0.4943534731864929, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8664682507514954, "eval_rewards/margins": 0.8664682507514954, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7129, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 900 }, { "epoch": 5.033519553072626, "grad_norm": 2.5779826951393097, "learning_rate": 8.663482020727013e-07, "logits/chosen": -3.718381643295288, "logits/rejected": -3.7165474891662598, "logps/chosen": -12.928625106811523, "logps/rejected": -37.67301940917969, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 2.815427780151367, "rewards/margins": 2.815427780151367, "rewards/rejected": 0.0, "step": 901 }, { "epoch": 5.039106145251397, "grad_norm": 2.3829605902021043, "learning_rate": 8.659331009590231e-07, "logits/chosen": -3.8124420642852783, "logits/rejected": -3.894714593887329, "logps/chosen": -6.802081108093262, "logps/rejected": -59.38578796386719, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 2.1188838481903076, "rewards/margins": 2.1188838481903076, "rewards/rejected": 0.0, "step": 902 }, { "epoch": 5.044692737430168, "grad_norm": 3.295346521656783, "learning_rate": 8.655174560035878e-07, "logits/chosen": -3.3991622924804688, "logits/rejected": -3.6705405712127686, "logps/chosen": -19.94412612915039, "logps/rejected": -27.22773551940918, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 3.003748893737793, "rewards/margins": 3.003748893737793, "rewards/rejected": 0.0, "step": 903 }, { "epoch": 5.050279329608939, "grad_norm": 2.4254931524029373, "learning_rate": 8.651012678241179e-07, "logits/chosen": -3.432058811187744, "logits/rejected": -3.484321117401123, "logps/chosen": -7.184386730194092, "logps/rejected": -19.5164794921875, "loss": 0.1367, "rewards/accuracies": 1.0, "rewards/chosen": 1.8795058727264404, "rewards/margins": 1.8795058727264404, "rewards/rejected": 0.0, "step": 904 }, { "epoch": 5.055865921787709, "grad_norm": 5.762149383593757, "learning_rate": 8.646845370391429e-07, "logits/chosen": -3.544438362121582, "logits/rejected": -3.552574872970581, "logps/chosen": -3.3116674423217773, "logps/rejected": -44.91862487792969, "loss": 0.1599, "rewards/accuracies": 1.0, "rewards/chosen": 2.1848440170288086, "rewards/margins": 2.1848440170288086, "rewards/rejected": 0.0, "step": 905 }, { "epoch": 5.06145251396648, "grad_norm": 2.238674137807379, "learning_rate": 8.642672642679991e-07, "logits/chosen": -2.8455119132995605, "logits/rejected": -3.1013801097869873, "logps/chosen": -37.575096130371094, "logps/rejected": -67.72724914550781, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 2.719628095626831, "rewards/margins": 2.719628095626831, "rewards/rejected": 0.0, "step": 906 }, { "epoch": 5.067039106145251, "grad_norm": 3.1359154613767766, "learning_rate": 8.638494501308282e-07, "logits/chosen": -3.614899158477783, "logits/rejected": -3.5207931995391846, "logps/chosen": -2.8579413890838623, "logps/rejected": -29.47831916809082, "loss": 0.1145, "rewards/accuracies": 1.0, "rewards/chosen": 1.9729458093643188, "rewards/margins": 1.9729458093643188, "rewards/rejected": 0.0, "step": 907 }, { "epoch": 5.072625698324022, "grad_norm": 2.9435008272268743, "learning_rate": 8.634310952485764e-07, "logits/chosen": -3.7244222164154053, "logits/rejected": -3.8382468223571777, "logps/chosen": -5.40566349029541, "logps/rejected": -39.49150085449219, "loss": 0.1289, "rewards/accuracies": 1.0, "rewards/chosen": 2.449627161026001, "rewards/margins": 2.449627161026001, "rewards/rejected": 0.0, "step": 908 }, { "epoch": 5.078212290502793, "grad_norm": 3.07483432464254, "learning_rate": 8.630122002429934e-07, "logits/chosen": -3.888763666152954, "logits/rejected": -3.8629233837127686, "logps/chosen": -4.398175239562988, "logps/rejected": -40.81145095825195, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 2.1777243614196777, "rewards/margins": 2.1777243614196777, "rewards/rejected": 0.0, "step": 909 }, { "epoch": 5.083798882681564, "grad_norm": 2.0242722462680076, "learning_rate": 8.625927657366321e-07, "logits/chosen": -3.602264165878296, "logits/rejected": -3.629054307937622, "logps/chosen": -7.341497421264648, "logps/rejected": -29.58710289001465, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 2.576740026473999, "rewards/margins": 2.576740026473999, "rewards/rejected": 0.0, "step": 910 }, { "epoch": 5.089385474860335, "grad_norm": 4.349726842397969, "learning_rate": 8.621727923528465e-07, "logits/chosen": -3.780391216278076, "logits/rejected": -3.8685684204101562, "logps/chosen": -13.424625396728516, "logps/rejected": -41.788330078125, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 2.475114345550537, "rewards/margins": 2.475114345550537, "rewards/rejected": 0.0, "step": 911 }, { "epoch": 5.094972067039106, "grad_norm": 2.1555804248891564, "learning_rate": 8.61752280715792e-07, "logits/chosen": -3.635103225708008, "logits/rejected": -3.7968673706054688, "logps/chosen": -4.576326370239258, "logps/rejected": -49.913028717041016, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 2.112658739089966, "rewards/margins": 2.112658739089966, "rewards/rejected": 0.0, "step": 912 }, { "epoch": 5.100558659217877, "grad_norm": 3.6088824487821602, "learning_rate": 8.613312314504239e-07, "logits/chosen": -3.730335235595703, "logits/rejected": -3.8086090087890625, "logps/chosen": -3.0457799434661865, "logps/rejected": -28.66973114013672, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 1.5662554502487183, "rewards/margins": 1.5662554502487183, "rewards/rejected": 0.0, "step": 913 }, { "epoch": 5.106145251396648, "grad_norm": 5.127206199475611, "learning_rate": 8.609096451824962e-07, "logits/chosen": -3.3814682960510254, "logits/rejected": -3.2327253818511963, "logps/chosen": -3.7657828330993652, "logps/rejected": -39.985130310058594, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 1.6491355895996094, "rewards/margins": 1.6491355895996094, "rewards/rejected": 0.0, "step": 914 }, { "epoch": 5.111731843575419, "grad_norm": 2.9408725419231034, "learning_rate": 8.604875225385612e-07, "logits/chosen": -3.7518861293792725, "logits/rejected": -3.776332139968872, "logps/chosen": -10.089970588684082, "logps/rejected": -22.130268096923828, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.302517890930176, "rewards/margins": 2.302517890930176, "rewards/rejected": 0.0, "step": 915 }, { "epoch": 5.11731843575419, "grad_norm": 2.2874280726807887, "learning_rate": 8.600648641459687e-07, "logits/chosen": -3.748547077178955, "logits/rejected": -3.8262057304382324, "logps/chosen": -3.8874173164367676, "logps/rejected": -28.511350631713867, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 1.9695693254470825, "rewards/margins": 1.9695693254470825, "rewards/rejected": 0.0, "step": 916 }, { "epoch": 5.122905027932961, "grad_norm": 3.3819493698529732, "learning_rate": 8.596416706328638e-07, "logits/chosen": -3.5149059295654297, "logits/rejected": -3.633885622024536, "logps/chosen": -5.141606330871582, "logps/rejected": -32.707542419433594, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 1.6533509492874146, "rewards/margins": 1.6533509492874146, "rewards/rejected": 0.0, "step": 917 }, { "epoch": 5.128491620111732, "grad_norm": 2.907818534904172, "learning_rate": 8.592179426281878e-07, "logits/chosen": -3.6833579540252686, "logits/rejected": -3.493891477584839, "logps/chosen": -6.513550758361816, "logps/rejected": -42.60105895996094, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 2.287080764770508, "rewards/margins": 2.287080764770508, "rewards/rejected": 0.0, "step": 918 }, { "epoch": 5.134078212290503, "grad_norm": 2.2665989967634226, "learning_rate": 8.587936807616756e-07, "logits/chosen": -3.459026336669922, "logits/rejected": -3.541001558303833, "logps/chosen": -20.24863052368164, "logps/rejected": -31.118236541748047, "loss": 0.1317, "rewards/accuracies": 1.0, "rewards/chosen": 2.420464038848877, "rewards/margins": 2.420464038848877, "rewards/rejected": 0.0, "step": 919 }, { "epoch": 5.139664804469274, "grad_norm": 2.123177189518529, "learning_rate": 8.583688856638563e-07, "logits/chosen": -3.3693435192108154, "logits/rejected": -3.469839572906494, "logps/chosen": -8.63968276977539, "logps/rejected": -47.11341857910156, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": 2.0286364555358887, "rewards/margins": 2.0286364555358887, "rewards/rejected": 0.0, "step": 920 }, { "epoch": 5.139664804469274, "eval_logits/chosen": -3.425973415374756, "eval_logits/rejected": -3.551579236984253, "eval_logps/chosen": -21.862384796142578, "eval_logps/rejected": -39.4764289855957, "eval_loss": 0.5151383876800537, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.8207657933235168, "eval_rewards/margins": 0.8207657933235168, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7056, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 920 }, { "epoch": 5.145251396648045, "grad_norm": 2.6781754907340325, "learning_rate": 8.579435579660509e-07, "logits/chosen": -3.6018340587615967, "logits/rejected": -3.6825060844421387, "logps/chosen": -4.666955471038818, "logps/rejected": -52.6751708984375, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 2.0365028381347656, "rewards/margins": 2.0365028381347656, "rewards/rejected": 0.0, "step": 921 }, { "epoch": 5.150837988826815, "grad_norm": 2.302785965792041, "learning_rate": 8.57517698300372e-07, "logits/chosen": -3.6873726844787598, "logits/rejected": -3.595874309539795, "logps/chosen": -14.56936264038086, "logps/rejected": -22.20042610168457, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": 3.0235393047332764, "rewards/margins": 3.0235393047332764, "rewards/rejected": 0.0, "step": 922 }, { "epoch": 5.156424581005586, "grad_norm": 3.426254757697152, "learning_rate": 8.570913072997233e-07, "logits/chosen": -3.8000307083129883, "logits/rejected": -3.5224392414093018, "logps/chosen": -6.43786096572876, "logps/rejected": -31.06023406982422, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 2.4343795776367188, "rewards/margins": 2.4343795776367188, "rewards/rejected": 0.0, "step": 923 }, { "epoch": 5.162011173184357, "grad_norm": 4.169084053169075, "learning_rate": 8.566643855977971e-07, "logits/chosen": -3.583242416381836, "logits/rejected": -3.4318792819976807, "logps/chosen": -9.185176849365234, "logps/rejected": -17.2728271484375, "loss": 0.1301, "rewards/accuracies": 1.0, "rewards/chosen": 2.572232246398926, "rewards/margins": 2.572232246398926, "rewards/rejected": 0.0, "step": 924 }, { "epoch": 5.167597765363128, "grad_norm": 2.1962278849430503, "learning_rate": 8.562369338290756e-07, "logits/chosen": -3.6337318420410156, "logits/rejected": -3.725064754486084, "logps/chosen": -5.347441673278809, "logps/rejected": -33.45294952392578, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 2.316202163696289, "rewards/margins": 2.316202163696289, "rewards/rejected": 0.0, "step": 925 }, { "epoch": 5.173184357541899, "grad_norm": 4.294902947201775, "learning_rate": 8.558089526288281e-07, "logits/chosen": -3.2959165573120117, "logits/rejected": -3.4752299785614014, "logps/chosen": -13.800309181213379, "logps/rejected": -29.787723541259766, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 2.654491901397705, "rewards/margins": 2.654491901397705, "rewards/rejected": 0.0, "step": 926 }, { "epoch": 5.17877094972067, "grad_norm": 3.2968934448645784, "learning_rate": 8.55380442633111e-07, "logits/chosen": -3.612819194793701, "logits/rejected": -3.552001953125, "logps/chosen": -26.93370819091797, "logps/rejected": -24.36080551147461, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 3.425076961517334, "rewards/margins": 3.425076961517334, "rewards/rejected": 0.0, "step": 927 }, { "epoch": 5.184357541899441, "grad_norm": 2.134847232261565, "learning_rate": 8.549514044787663e-07, "logits/chosen": -3.552821397781372, "logits/rejected": -3.3308751583099365, "logps/chosen": -6.113100051879883, "logps/rejected": -52.883689880371094, "loss": 0.1512, "rewards/accuracies": 1.0, "rewards/chosen": 2.0449371337890625, "rewards/margins": 2.0449371337890625, "rewards/rejected": 0.0, "step": 928 }, { "epoch": 5.189944134078212, "grad_norm": 1.9644718559300236, "learning_rate": 8.54521838803421e-07, "logits/chosen": -3.4582157135009766, "logits/rejected": -3.522745370864868, "logps/chosen": -10.016764640808105, "logps/rejected": -36.827789306640625, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": 2.741762161254883, "rewards/margins": 2.741762161254883, "rewards/rejected": 0.0, "step": 929 }, { "epoch": 5.195530726256983, "grad_norm": 9.682737619449624, "learning_rate": 8.540917462454864e-07, "logits/chosen": -3.9171395301818848, "logits/rejected": -3.842108964920044, "logps/chosen": -5.504759311676025, "logps/rejected": -33.843650817871094, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": 2.1367480754852295, "rewards/margins": 2.1367480754852295, "rewards/rejected": 0.0, "step": 930 }, { "epoch": 5.201117318435754, "grad_norm": 2.0908246070117453, "learning_rate": 8.536611274441566e-07, "logits/chosen": -3.6773064136505127, "logits/rejected": -3.590070962905884, "logps/chosen": -38.34971618652344, "logps/rejected": -41.589012145996094, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 3.5228641033172607, "rewards/margins": 3.5228641033172607, "rewards/rejected": 0.0, "step": 931 }, { "epoch": 5.206703910614525, "grad_norm": 2.873060339890528, "learning_rate": 8.532299830394079e-07, "logits/chosen": -3.4728243350982666, "logits/rejected": -3.8036398887634277, "logps/chosen": -5.717551231384277, "logps/rejected": -60.32939910888672, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.168911933898926, "rewards/margins": 2.168911933898926, "rewards/rejected": 0.0, "step": 932 }, { "epoch": 5.212290502793296, "grad_norm": 4.196625161083795, "learning_rate": 8.527983136719976e-07, "logits/chosen": -3.3160006999969482, "logits/rejected": -3.38995623588562, "logps/chosen": -22.3360595703125, "logps/rejected": -26.047075271606445, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.3260152339935303, "rewards/margins": 2.3260152339935303, "rewards/rejected": 0.0, "step": 933 }, { "epoch": 5.217877094972067, "grad_norm": 2.684892475589337, "learning_rate": 8.523661199834631e-07, "logits/chosen": -3.8219635486602783, "logits/rejected": -3.9562270641326904, "logps/chosen": -5.9781270027160645, "logps/rejected": -39.06108093261719, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 2.3151016235351562, "rewards/margins": 2.3151016235351562, "rewards/rejected": 0.0, "step": 934 }, { "epoch": 5.223463687150838, "grad_norm": 3.361433215437716, "learning_rate": 8.519334026161214e-07, "logits/chosen": -3.4284021854400635, "logits/rejected": -3.430989980697632, "logps/chosen": -1.641413688659668, "logps/rejected": -48.60231018066406, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 1.6644656658172607, "rewards/margins": 1.6644656658172607, "rewards/rejected": 0.0, "step": 935 }, { "epoch": 5.229050279329609, "grad_norm": 2.448103045769202, "learning_rate": 8.515001622130675e-07, "logits/chosen": -3.7019660472869873, "logits/rejected": -3.4524085521698, "logps/chosen": -4.677493095397949, "logps/rejected": -43.681678771972656, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 2.2800111770629883, "rewards/margins": 2.2800111770629883, "rewards/rejected": 0.0, "step": 936 }, { "epoch": 5.23463687150838, "grad_norm": 3.3938076413380376, "learning_rate": 8.510663994181738e-07, "logits/chosen": -3.1296334266662598, "logits/rejected": -3.234459400177002, "logps/chosen": -25.453676223754883, "logps/rejected": -38.42242431640625, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": 3.0531082153320312, "rewards/margins": 3.0531082153320312, "rewards/rejected": 0.0, "step": 937 }, { "epoch": 5.240223463687151, "grad_norm": 3.0708984028748247, "learning_rate": 8.506321148760891e-07, "logits/chosen": -3.6749613285064697, "logits/rejected": -3.7708475589752197, "logps/chosen": -3.0763132572174072, "logps/rejected": -33.902042388916016, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 1.474084734916687, "rewards/margins": 1.474084734916687, "rewards/rejected": 0.0, "step": 938 }, { "epoch": 5.245810055865922, "grad_norm": 3.9396728681411206, "learning_rate": 8.501973092322375e-07, "logits/chosen": -3.4395763874053955, "logits/rejected": -3.355232000350952, "logps/chosen": -5.489841938018799, "logps/rejected": -23.398866653442383, "loss": 0.1297, "rewards/accuracies": 1.0, "rewards/chosen": 2.4068026542663574, "rewards/margins": 2.4068026542663574, "rewards/rejected": 0.0, "step": 939 }, { "epoch": 5.251396648044693, "grad_norm": 3.2175744448504893, "learning_rate": 8.497619831328178e-07, "logits/chosen": -3.805591106414795, "logits/rejected": -3.9668242931365967, "logps/chosen": -10.686816215515137, "logps/rejected": -28.034881591796875, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 2.7061309814453125, "rewards/margins": 2.7061309814453125, "rewards/rejected": 0.0, "step": 940 }, { "epoch": 5.251396648044693, "eval_logits/chosen": -3.4313652515411377, "eval_logits/rejected": -3.555899143218994, "eval_logps/chosen": -21.915748596191406, "eval_logps/rejected": -40.4402961730957, "eval_loss": 0.5229962468147278, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8154290914535522, "eval_rewards/margins": 0.8154290914535522, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7408, "eval_samples_per_second": 9.468, "eval_steps_per_second": 0.305, "step": 940 }, { "epoch": 5.256983240223463, "grad_norm": 2.1227916790899037, "learning_rate": 8.493261372248017e-07, "logits/chosen": -3.7516915798187256, "logits/rejected": -3.5086164474487305, "logps/chosen": -8.845268249511719, "logps/rejected": -27.884584426879883, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.5056686401367188, "rewards/margins": 2.5056686401367188, "rewards/rejected": 0.0, "step": 941 }, { "epoch": 5.262569832402234, "grad_norm": 2.86054465219148, "learning_rate": 8.488897721559343e-07, "logits/chosen": -3.6603825092315674, "logits/rejected": -3.7184808254241943, "logps/chosen": -5.2650980949401855, "logps/rejected": -37.26551055908203, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 1.9481420516967773, "rewards/margins": 1.9481420516967773, "rewards/rejected": 0.0, "step": 942 }, { "epoch": 5.268156424581005, "grad_norm": 3.4441727530514172, "learning_rate": 8.484528885747315e-07, "logits/chosen": -3.4722037315368652, "logits/rejected": -3.4777796268463135, "logps/chosen": -3.823040008544922, "logps/rejected": -30.969818115234375, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 1.9734690189361572, "rewards/margins": 1.9734690189361572, "rewards/rejected": 0.0, "step": 943 }, { "epoch": 5.273743016759776, "grad_norm": 5.2095703916482154, "learning_rate": 8.480154871304801e-07, "logits/chosen": -3.396315097808838, "logits/rejected": -3.4404098987579346, "logps/chosen": -11.265424728393555, "logps/rejected": -14.863361358642578, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 2.128246545791626, "rewards/margins": 2.128246545791626, "rewards/rejected": 0.0, "step": 944 }, { "epoch": 5.279329608938547, "grad_norm": 3.3338926579251162, "learning_rate": 8.475775684732369e-07, "logits/chosen": -3.5682334899902344, "logits/rejected": -3.294985771179199, "logps/chosen": -1.143467903137207, "logps/rejected": -56.27909469604492, "loss": 0.1497, "rewards/accuracies": 1.0, "rewards/chosen": 1.2283358573913574, "rewards/margins": 1.2283358573913574, "rewards/rejected": 0.0, "step": 945 }, { "epoch": 5.284916201117318, "grad_norm": 2.9230370638327585, "learning_rate": 8.471391332538266e-07, "logits/chosen": -3.6190085411071777, "logits/rejected": -3.870126962661743, "logps/chosen": -11.431694030761719, "logps/rejected": -18.61604881286621, "loss": 0.1224, "rewards/accuracies": 1.0, "rewards/chosen": 2.6102752685546875, "rewards/margins": 2.6102752685546875, "rewards/rejected": 0.0, "step": 946 }, { "epoch": 5.290502793296089, "grad_norm": 3.2666822273478417, "learning_rate": 8.467001821238421e-07, "logits/chosen": -3.7518441677093506, "logits/rejected": -3.6237411499023438, "logps/chosen": -1.5083858966827393, "logps/rejected": -82.01118469238281, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 1.424487590789795, "rewards/margins": 1.424487590789795, "rewards/rejected": 0.0, "step": 947 }, { "epoch": 5.29608938547486, "grad_norm": 2.402537784639173, "learning_rate": 8.462607157356434e-07, "logits/chosen": -3.1628594398498535, "logits/rejected": -3.3524177074432373, "logps/chosen": -4.144198894500732, "logps/rejected": -45.98481750488281, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 1.9893189668655396, "rewards/margins": 1.9893189668655396, "rewards/rejected": 0.0, "step": 948 }, { "epoch": 5.301675977653631, "grad_norm": 2.1287793565080166, "learning_rate": 8.458207347423553e-07, "logits/chosen": -3.2780921459198, "logits/rejected": -3.2125978469848633, "logps/chosen": -5.203969478607178, "logps/rejected": -66.4188232421875, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": 2.149090528488159, "rewards/margins": 2.149090528488159, "rewards/rejected": 0.0, "step": 949 }, { "epoch": 5.307262569832402, "grad_norm": 3.8601327518074102, "learning_rate": 8.45380239797868e-07, "logits/chosen": -3.5509846210479736, "logits/rejected": -3.5342063903808594, "logps/chosen": -10.181859970092773, "logps/rejected": -30.597375869750977, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 2.5514869689941406, "rewards/margins": 2.5514869689941406, "rewards/rejected": 0.0, "step": 950 }, { "epoch": 5.312849162011173, "grad_norm": 2.3130618426281937, "learning_rate": 8.449392315568355e-07, "logits/chosen": -3.836066484451294, "logits/rejected": -3.620694637298584, "logps/chosen": -8.613641738891602, "logps/rejected": -36.907615661621094, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 2.1679139137268066, "rewards/margins": 2.1679139137268066, "rewards/rejected": 0.0, "step": 951 }, { "epoch": 5.318435754189944, "grad_norm": 2.0727404018073043, "learning_rate": 8.444977106746746e-07, "logits/chosen": -3.6299524307250977, "logits/rejected": -3.7042176723480225, "logps/chosen": -25.02996826171875, "logps/rejected": -30.16958236694336, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 3.044982433319092, "rewards/margins": 3.044982433319092, "rewards/rejected": 0.0, "step": 952 }, { "epoch": 5.324022346368715, "grad_norm": 3.5282975987744707, "learning_rate": 8.44055677807564e-07, "logits/chosen": -3.3784050941467285, "logits/rejected": -3.7101690769195557, "logps/chosen": -6.744629859924316, "logps/rejected": -48.71521759033203, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.211911201477051, "rewards/margins": 2.211911201477051, "rewards/rejected": 0.0, "step": 953 }, { "epoch": 5.329608938547486, "grad_norm": 1.88849840259381, "learning_rate": 8.436131336124431e-07, "logits/chosen": -3.7207722663879395, "logits/rejected": -3.75814151763916, "logps/chosen": -2.950428009033203, "logps/rejected": -43.67927551269531, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.291203022003174, "rewards/margins": 2.291203022003174, "rewards/rejected": 0.0, "step": 954 }, { "epoch": 5.335195530726257, "grad_norm": 2.862908630145159, "learning_rate": 8.431700787470116e-07, "logits/chosen": -3.9300715923309326, "logits/rejected": -3.9752509593963623, "logps/chosen": -37.81331253051758, "logps/rejected": -30.72120475769043, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 3.501981735229492, "rewards/margins": 3.501981735229492, "rewards/rejected": 0.0, "step": 955 }, { "epoch": 5.340782122905028, "grad_norm": 3.7350135696278914, "learning_rate": 8.427265138697279e-07, "logits/chosen": -3.7004594802856445, "logits/rejected": -3.6338562965393066, "logps/chosen": -8.439620971679688, "logps/rejected": -29.650474548339844, "loss": 0.129, "rewards/accuracies": 1.0, "rewards/chosen": 2.3722708225250244, "rewards/margins": 2.3722708225250244, "rewards/rejected": 0.0, "step": 956 }, { "epoch": 5.346368715083799, "grad_norm": 2.3180477658725267, "learning_rate": 8.422824396398081e-07, "logits/chosen": -3.6553523540496826, "logits/rejected": -3.615277051925659, "logps/chosen": -5.601487159729004, "logps/rejected": -32.682823181152344, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 2.2934601306915283, "rewards/margins": 2.2934601306915283, "rewards/rejected": 0.0, "step": 957 }, { "epoch": 5.351955307262569, "grad_norm": 2.624376384375354, "learning_rate": 8.41837856717226e-07, "logits/chosen": -3.8188042640686035, "logits/rejected": -3.8369505405426025, "logps/chosen": -14.519266128540039, "logps/rejected": -36.771400451660156, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 2.2506394386291504, "rewards/margins": 2.2506394386291504, "rewards/rejected": 0.0, "step": 958 }, { "epoch": 5.35754189944134, "grad_norm": 2.9579438881756746, "learning_rate": 8.413927657627109e-07, "logits/chosen": -3.6174941062927246, "logits/rejected": -3.5712153911590576, "logps/chosen": -3.6067914962768555, "logps/rejected": -53.74940490722656, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 2.3034090995788574, "rewards/margins": 2.3034090995788574, "rewards/rejected": 0.0, "step": 959 }, { "epoch": 5.363128491620111, "grad_norm": 3.1962700511590163, "learning_rate": 8.40947167437747e-07, "logits/chosen": -3.660860061645508, "logits/rejected": -3.572692632675171, "logps/chosen": -9.876673698425293, "logps/rejected": -41.77517318725586, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 2.6486129760742188, "rewards/margins": 2.6486129760742188, "rewards/rejected": 0.0, "step": 960 }, { "epoch": 5.363128491620111, "eval_logits/chosen": -3.4246811866760254, "eval_logits/rejected": -3.549966335296631, "eval_logps/chosen": -21.75457763671875, "eval_logps/rejected": -40.558349609375, "eval_loss": 0.5337956547737122, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8315466046333313, "eval_rewards/margins": 0.8315466046333313, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6947, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 960 }, { "epoch": 5.368715083798882, "grad_norm": 4.816839452806974, "learning_rate": 8.40501062404573e-07, "logits/chosen": -3.4338021278381348, "logits/rejected": -3.2634763717651367, "logps/chosen": -15.61409854888916, "logps/rejected": -34.017295837402344, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": 3.017667770385742, "rewards/margins": 3.017667770385742, "rewards/rejected": 0.0, "step": 961 }, { "epoch": 5.374301675977653, "grad_norm": 2.482510011792642, "learning_rate": 8.400544513261803e-07, "logits/chosen": -3.8516783714294434, "logits/rejected": -3.761363983154297, "logps/chosen": -3.9727489948272705, "logps/rejected": -32.69411849975586, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 2.200329303741455, "rewards/margins": 2.200329303741455, "rewards/rejected": 0.0, "step": 962 }, { "epoch": 5.379888268156424, "grad_norm": 1.748209005735185, "learning_rate": 8.396073348663125e-07, "logits/chosen": -3.537409543991089, "logits/rejected": -3.4711556434631348, "logps/chosen": -6.442181587219238, "logps/rejected": -26.657386779785156, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 2.5487351417541504, "rewards/margins": 2.5487351417541504, "rewards/rejected": 0.0, "step": 963 }, { "epoch": 5.385474860335195, "grad_norm": 3.6730868297201518, "learning_rate": 8.391597136894644e-07, "logits/chosen": -3.5208287239074707, "logits/rejected": -3.2740936279296875, "logps/chosen": -11.94963550567627, "logps/rejected": -44.65607833862305, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 2.4625320434570312, "rewards/margins": 2.4625320434570312, "rewards/rejected": 0.0, "step": 964 }, { "epoch": 5.391061452513966, "grad_norm": 2.632398028082482, "learning_rate": 8.387115884608807e-07, "logits/chosen": -3.6703941822052, "logits/rejected": -3.7788689136505127, "logps/chosen": -1.2421746253967285, "logps/rejected": -90.82178497314453, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 1.3596502542495728, "rewards/margins": 1.3596502542495728, "rewards/rejected": 0.0, "step": 965 }, { "epoch": 5.396648044692737, "grad_norm": 3.7936712908039394, "learning_rate": 8.382629598465554e-07, "logits/chosen": -3.911432981491089, "logits/rejected": -3.613752841949463, "logps/chosen": -4.657186508178711, "logps/rejected": -24.86302947998047, "loss": 0.1416, "rewards/accuracies": 1.0, "rewards/chosen": 2.330866813659668, "rewards/margins": 2.330866813659668, "rewards/rejected": 0.0, "step": 966 }, { "epoch": 5.402234636871508, "grad_norm": 1.7917417081482783, "learning_rate": 8.378138285132303e-07, "logits/chosen": -3.687436819076538, "logits/rejected": -3.62178373336792, "logps/chosen": -1.7335727214813232, "logps/rejected": -30.068944931030273, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 1.4629037380218506, "rewards/margins": 1.4629037380218506, "rewards/rejected": 0.0, "step": 967 }, { "epoch": 5.407821229050279, "grad_norm": 3.3786380351906637, "learning_rate": 8.373641951283947e-07, "logits/chosen": -3.4602854251861572, "logits/rejected": -3.360727071762085, "logps/chosen": -2.335566997528076, "logps/rejected": -38.54754638671875, "loss": 0.1297, "rewards/accuracies": 1.0, "rewards/chosen": 1.7878601551055908, "rewards/margins": 1.7878601551055908, "rewards/rejected": 0.0, "step": 968 }, { "epoch": 5.41340782122905, "grad_norm": 3.062911125741669, "learning_rate": 8.36914060360284e-07, "logits/chosen": -3.9719436168670654, "logits/rejected": -3.806168556213379, "logps/chosen": -7.921912670135498, "logps/rejected": -35.76070022583008, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": 2.394432544708252, "rewards/margins": 2.394432544708252, "rewards/rejected": 0.0, "step": 969 }, { "epoch": 5.418994413407821, "grad_norm": 4.118220377183595, "learning_rate": 8.364634248778783e-07, "logits/chosen": -3.5636565685272217, "logits/rejected": -3.6488091945648193, "logps/chosen": -6.545294761657715, "logps/rejected": -37.022560119628906, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 2.072500705718994, "rewards/margins": 2.072500705718994, "rewards/rejected": 0.0, "step": 970 }, { "epoch": 5.424581005586592, "grad_norm": 2.423093707243962, "learning_rate": 8.360122893509025e-07, "logits/chosen": -3.716005802154541, "logits/rejected": -3.667078733444214, "logps/chosen": -6.051375865936279, "logps/rejected": -51.206546783447266, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 2.1058497428894043, "rewards/margins": 2.1058497428894043, "rewards/rejected": 0.0, "step": 971 }, { "epoch": 5.430167597765363, "grad_norm": 2.469023269894222, "learning_rate": 8.355606544498241e-07, "logits/chosen": -3.6141245365142822, "logits/rejected": -3.656458854675293, "logps/chosen": -6.50161075592041, "logps/rejected": -28.264385223388672, "loss": 0.1372, "rewards/accuracies": 1.0, "rewards/chosen": 2.2572598457336426, "rewards/margins": 2.2572598457336426, "rewards/rejected": 0.0, "step": 972 }, { "epoch": 5.435754189944134, "grad_norm": 2.870405773440076, "learning_rate": 8.351085208458533e-07, "logits/chosen": -3.440138816833496, "logits/rejected": -3.4495396614074707, "logps/chosen": -5.213408470153809, "logps/rejected": -53.99237060546875, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 2.179966449737549, "rewards/margins": 2.179966449737549, "rewards/rejected": 0.0, "step": 973 }, { "epoch": 5.441340782122905, "grad_norm": 4.779981011272239, "learning_rate": 8.346558892109407e-07, "logits/chosen": -3.6187150478363037, "logits/rejected": -3.481257200241089, "logps/chosen": -7.725311279296875, "logps/rejected": -26.0552978515625, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": 1.9844796657562256, "rewards/margins": 1.9844796657562256, "rewards/rejected": 0.0, "step": 974 }, { "epoch": 5.446927374301676, "grad_norm": 3.662552416936445, "learning_rate": 8.342027602177779e-07, "logits/chosen": -3.3446497917175293, "logits/rejected": -3.2567250728607178, "logps/chosen": -18.352100372314453, "logps/rejected": -29.235855102539062, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 2.959120988845825, "rewards/margins": 2.959120988845825, "rewards/rejected": 0.0, "step": 975 }, { "epoch": 5.452513966480447, "grad_norm": 2.457014682852703, "learning_rate": 8.337491345397952e-07, "logits/chosen": -3.4628405570983887, "logits/rejected": -3.2928173542022705, "logps/chosen": -11.77698040008545, "logps/rejected": -48.920654296875, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 2.4243571758270264, "rewards/margins": 2.4243571758270264, "rewards/rejected": 0.0, "step": 976 }, { "epoch": 5.4581005586592175, "grad_norm": 2.4380306326679055, "learning_rate": 8.332950128511611e-07, "logits/chosen": -3.9781696796417236, "logits/rejected": -3.8392252922058105, "logps/chosen": -3.0315113067626953, "logps/rejected": -49.192378997802734, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 1.9341998100280762, "rewards/margins": 1.9341998100280762, "rewards/rejected": 0.0, "step": 977 }, { "epoch": 5.4636871508379885, "grad_norm": 4.0072399880982585, "learning_rate": 8.328403958267813e-07, "logits/chosen": -3.6522932052612305, "logits/rejected": -3.7077629566192627, "logps/chosen": -10.203707695007324, "logps/rejected": -29.24556541442871, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 2.6389682292938232, "rewards/margins": 2.6389682292938232, "rewards/rejected": 0.0, "step": 978 }, { "epoch": 5.4692737430167595, "grad_norm": 4.9940949419550416, "learning_rate": 8.323852841422976e-07, "logits/chosen": -3.3061370849609375, "logits/rejected": -3.5059430599212646, "logps/chosen": -1.0360535383224487, "logps/rejected": -56.605987548828125, "loss": 0.1476, "rewards/accuracies": 1.0, "rewards/chosen": 1.1801810264587402, "rewards/margins": 1.1801810264587402, "rewards/rejected": 0.0, "step": 979 }, { "epoch": 5.4748603351955305, "grad_norm": 4.580311287796682, "learning_rate": 8.319296784740872e-07, "logits/chosen": -3.534660816192627, "logits/rejected": -3.7138571739196777, "logps/chosen": -11.394460678100586, "logps/rejected": -25.93413543701172, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 2.7908012866973877, "rewards/margins": 2.7908012866973877, "rewards/rejected": 0.0, "step": 980 }, { "epoch": 5.4748603351955305, "eval_logits/chosen": -3.4305739402770996, "eval_logits/rejected": -3.554555892944336, "eval_logps/chosen": -22.076717376708984, "eval_logps/rejected": -40.22883987426758, "eval_loss": 0.5252670049667358, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.7993324398994446, "eval_rewards/margins": 0.7993324398994446, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7101, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 980 }, { "epoch": 5.4804469273743015, "grad_norm": 2.8556803617902577, "learning_rate": 8.314735794992611e-07, "logits/chosen": -3.6257777214050293, "logits/rejected": -3.6955454349517822, "logps/chosen": -28.178733825683594, "logps/rejected": -27.137306213378906, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": 2.659531593322754, "rewards/margins": 2.659531593322754, "rewards/rejected": 0.0, "step": 981 }, { "epoch": 5.4860335195530725, "grad_norm": 2.670914142698, "learning_rate": 8.310169878956636e-07, "logits/chosen": -3.495283365249634, "logits/rejected": -3.2490310668945312, "logps/chosen": -3.7130908966064453, "logps/rejected": -30.080886840820312, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 2.114500045776367, "rewards/margins": 2.114500045776367, "rewards/rejected": 0.0, "step": 982 }, { "epoch": 5.4916201117318435, "grad_norm": 10.020042671020262, "learning_rate": 8.305599043418711e-07, "logits/chosen": -3.626030206680298, "logits/rejected": -3.4159045219421387, "logps/chosen": -11.557868003845215, "logps/rejected": -47.024723052978516, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 2.241159439086914, "rewards/margins": 2.241159439086914, "rewards/rejected": 0.0, "step": 983 }, { "epoch": 5.4972067039106145, "grad_norm": 6.178298743082472, "learning_rate": 8.301023295171915e-07, "logits/chosen": -3.4850172996520996, "logits/rejected": -3.5257372856140137, "logps/chosen": -10.838176727294922, "logps/rejected": -26.925825119018555, "loss": 0.132, "rewards/accuracies": 1.0, "rewards/chosen": 2.4204211235046387, "rewards/margins": 2.4204211235046387, "rewards/rejected": 0.0, "step": 984 }, { "epoch": 5.5027932960893855, "grad_norm": 3.4672961029394913, "learning_rate": 8.296442641016621e-07, "logits/chosen": -3.311988592147827, "logits/rejected": -3.3689091205596924, "logps/chosen": -4.270512580871582, "logps/rejected": -45.78513717651367, "loss": 0.1356, "rewards/accuracies": 1.0, "rewards/chosen": 2.1297318935394287, "rewards/margins": 2.1297318935394287, "rewards/rejected": 0.0, "step": 985 }, { "epoch": 5.5083798882681565, "grad_norm": 3.9497562228771117, "learning_rate": 8.291857087760498e-07, "logits/chosen": -3.668921947479248, "logits/rejected": -3.7324578762054443, "logps/chosen": -11.818612098693848, "logps/rejected": -46.838600158691406, "loss": 0.1385, "rewards/accuracies": 1.0, "rewards/chosen": 2.626201629638672, "rewards/margins": 2.626201629638672, "rewards/rejected": 0.0, "step": 986 }, { "epoch": 5.5139664804469275, "grad_norm": 4.330539574087278, "learning_rate": 8.287266642218495e-07, "logits/chosen": -3.6318037509918213, "logits/rejected": -3.7037880420684814, "logps/chosen": -7.9843573570251465, "logps/rejected": -31.925270080566406, "loss": 0.1409, "rewards/accuracies": 1.0, "rewards/chosen": 2.511212110519409, "rewards/margins": 2.511212110519409, "rewards/rejected": 0.0, "step": 987 }, { "epoch": 5.5195530726256985, "grad_norm": 3.2623845153838342, "learning_rate": 8.282671311212832e-07, "logits/chosen": -3.3751072883605957, "logits/rejected": -3.4393019676208496, "logps/chosen": -16.140539169311523, "logps/rejected": -38.205322265625, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 2.369124174118042, "rewards/margins": 2.369124174118042, "rewards/rejected": 0.0, "step": 988 }, { "epoch": 5.5251396648044695, "grad_norm": 3.8885666395174705, "learning_rate": 8.278071101572988e-07, "logits/chosen": -3.4989941120147705, "logits/rejected": -3.589448928833008, "logps/chosen": -6.971655368804932, "logps/rejected": -29.12643051147461, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 1.9628701210021973, "rewards/margins": 1.9628701210021973, "rewards/rejected": 0.0, "step": 989 }, { "epoch": 5.5307262569832405, "grad_norm": 4.020234784130751, "learning_rate": 8.273466020135694e-07, "logits/chosen": -3.7029707431793213, "logits/rejected": -3.753220558166504, "logps/chosen": -8.310090065002441, "logps/rejected": -27.959949493408203, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 2.644866466522217, "rewards/margins": 2.644866466522217, "rewards/rejected": 0.0, "step": 990 }, { "epoch": 5.5363128491620115, "grad_norm": 2.8017690945691913, "learning_rate": 8.268856073744923e-07, "logits/chosen": -3.3072383403778076, "logits/rejected": -3.4897496700286865, "logps/chosen": -13.173815727233887, "logps/rejected": -89.54933166503906, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.632387161254883, "rewards/margins": 2.632387161254883, "rewards/rejected": 0.0, "step": 991 }, { "epoch": 5.5418994413407825, "grad_norm": 5.324536947997927, "learning_rate": 8.264241269251875e-07, "logits/chosen": -3.8031108379364014, "logits/rejected": -3.923901319503784, "logps/chosen": -30.693422317504883, "logps/rejected": -28.303131103515625, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 3.6970062255859375, "rewards/margins": 3.6970062255859375, "rewards/rejected": 0.0, "step": 992 }, { "epoch": 5.547486033519553, "grad_norm": 13.275659618085774, "learning_rate": 8.259621613514969e-07, "logits/chosen": -3.4624850749969482, "logits/rejected": -3.402142286300659, "logps/chosen": -9.430938720703125, "logps/rejected": -51.62495422363281, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": 2.3033833503723145, "rewards/margins": 2.3033833503723145, "rewards/rejected": 0.0, "step": 993 }, { "epoch": 5.553072625698324, "grad_norm": 5.129162530894774, "learning_rate": 8.254997113399842e-07, "logits/chosen": -4.001574516296387, "logits/rejected": -3.6959242820739746, "logps/chosen": -9.59112548828125, "logps/rejected": -36.154266357421875, "loss": 0.1283, "rewards/accuracies": 1.0, "rewards/chosen": 2.5434792041778564, "rewards/margins": 2.5434792041778564, "rewards/rejected": 0.0, "step": 994 }, { "epoch": 5.558659217877095, "grad_norm": 3.12863945151769, "learning_rate": 8.250367775779319e-07, "logits/chosen": -3.3619141578674316, "logits/rejected": -3.4087536334991455, "logps/chosen": -4.648746013641357, "logps/rejected": -88.69186401367188, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 1.778944969177246, "rewards/margins": 1.778944969177246, "rewards/rejected": 0.0, "step": 995 }, { "epoch": 5.564245810055866, "grad_norm": 4.191695662470572, "learning_rate": 8.245733607533423e-07, "logits/chosen": -3.499629259109497, "logits/rejected": -3.460113525390625, "logps/chosen": -14.256868362426758, "logps/rejected": -28.99484634399414, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.6720776557922363, "rewards/margins": 2.6720776557922363, "rewards/rejected": 0.0, "step": 996 }, { "epoch": 5.569832402234637, "grad_norm": 2.8226776540352794, "learning_rate": 8.241094615549352e-07, "logits/chosen": -3.471566915512085, "logits/rejected": -3.5952682495117188, "logps/chosen": -5.305325031280518, "logps/rejected": -55.97410202026367, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 2.113853931427002, "rewards/margins": 2.113853931427002, "rewards/rejected": 0.0, "step": 997 }, { "epoch": 5.575418994413408, "grad_norm": 3.013186677958537, "learning_rate": 8.236450806721475e-07, "logits/chosen": -3.428640127182007, "logits/rejected": -3.337449550628662, "logps/chosen": -3.7617545127868652, "logps/rejected": -34.300498962402344, "loss": 0.1294, "rewards/accuracies": 1.0, "rewards/chosen": 1.9175052642822266, "rewards/margins": 1.9175052642822266, "rewards/rejected": 0.0, "step": 998 }, { "epoch": 5.581005586592179, "grad_norm": 3.7475550308034378, "learning_rate": 8.23180218795132e-07, "logits/chosen": -3.6567182540893555, "logits/rejected": -3.4699931144714355, "logps/chosen": -8.725936889648438, "logps/rejected": -26.261375427246094, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": 2.1150784492492676, "rewards/margins": 2.1150784492492676, "rewards/rejected": 0.0, "step": 999 }, { "epoch": 5.58659217877095, "grad_norm": 2.4820108158390095, "learning_rate": 8.22714876614756e-07, "logits/chosen": -3.543668508529663, "logits/rejected": -3.7309658527374268, "logps/chosen": -19.350299835205078, "logps/rejected": -23.483177185058594, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 2.6947455406188965, "rewards/margins": 2.6947455406188965, "rewards/rejected": 0.0, "step": 1000 }, { "epoch": 5.58659217877095, "eval_logits/chosen": -3.4348416328430176, "eval_logits/rejected": -3.5552430152893066, "eval_logps/chosen": -21.70493507385254, "eval_logps/rejected": -39.91596221923828, "eval_loss": 0.5152106285095215, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.8365108370780945, "eval_rewards/margins": 0.8365108370780945, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7193, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 1000 }, { "epoch": 5.592178770949721, "grad_norm": 2.432680359777377, "learning_rate": 8.222490548226011e-07, "logits/chosen": -3.6886191368103027, "logits/rejected": -3.5714361667633057, "logps/chosen": -5.722451210021973, "logps/rejected": -28.130802154541016, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 2.3259849548339844, "rewards/margins": 2.3259849548339844, "rewards/rejected": 0.0, "step": 1001 }, { "epoch": 5.597765363128492, "grad_norm": 2.2978400226359725, "learning_rate": 8.217827541109612e-07, "logits/chosen": -3.263062000274658, "logits/rejected": -3.352325677871704, "logps/chosen": -3.045107126235962, "logps/rejected": -60.54975128173828, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": 1.559848666191101, "rewards/margins": 1.559848666191101, "rewards/rejected": 0.0, "step": 1002 }, { "epoch": 5.603351955307263, "grad_norm": 1.9418465558548306, "learning_rate": 8.213159751728424e-07, "logits/chosen": -3.3491270542144775, "logits/rejected": -3.388939619064331, "logps/chosen": -16.784282684326172, "logps/rejected": -29.645170211791992, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 3.189891815185547, "rewards/margins": 3.189891815185547, "rewards/rejected": 0.0, "step": 1003 }, { "epoch": 5.608938547486034, "grad_norm": 4.783557383585265, "learning_rate": 8.208487187019612e-07, "logits/chosen": -3.413283348083496, "logits/rejected": -3.386101245880127, "logps/chosen": -1.149272084236145, "logps/rejected": -29.626628875732422, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 1.537308692932129, "rewards/margins": 1.537308692932129, "rewards/rejected": 0.0, "step": 1004 }, { "epoch": 5.614525139664805, "grad_norm": 3.2186521172932294, "learning_rate": 8.203809853927439e-07, "logits/chosen": -3.5104498863220215, "logits/rejected": -3.4505863189697266, "logps/chosen": -15.869890213012695, "logps/rejected": -57.78500747680664, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.779179334640503, "rewards/margins": 2.779179334640503, "rewards/rejected": 0.0, "step": 1005 }, { "epoch": 5.620111731843576, "grad_norm": 6.515149987517597, "learning_rate": 8.199127759403257e-07, "logits/chosen": -3.5511887073516846, "logits/rejected": -3.4592039585113525, "logps/chosen": -7.230931282043457, "logps/rejected": -35.0115966796875, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 2.145414352416992, "rewards/margins": 2.145414352416992, "rewards/rejected": 0.0, "step": 1006 }, { "epoch": 5.625698324022347, "grad_norm": 5.188676319462628, "learning_rate": 8.194440910405492e-07, "logits/chosen": -3.5458908081054688, "logits/rejected": -3.509129285812378, "logps/chosen": -5.889590263366699, "logps/rejected": -36.376590728759766, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 2.3784074783325195, "rewards/margins": 2.3784074783325195, "rewards/rejected": 0.0, "step": 1007 }, { "epoch": 5.631284916201118, "grad_norm": 2.9165117914502177, "learning_rate": 8.189749313899632e-07, "logits/chosen": -3.3884472846984863, "logits/rejected": -3.6048972606658936, "logps/chosen": -3.1345629692077637, "logps/rejected": -105.1725845336914, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 2.012176990509033, "rewards/margins": 2.012176990509033, "rewards/rejected": 0.0, "step": 1008 }, { "epoch": 5.636871508379889, "grad_norm": 4.732486447912168, "learning_rate": 8.185052976858231e-07, "logits/chosen": -3.626363754272461, "logits/rejected": -3.478931188583374, "logps/chosen": -5.381372451782227, "logps/rejected": -42.6785888671875, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 2.6120800971984863, "rewards/margins": 2.6120800971984863, "rewards/rejected": 0.0, "step": 1009 }, { "epoch": 5.64245810055866, "grad_norm": 2.897188146080436, "learning_rate": 8.180351906260878e-07, "logits/chosen": -3.9950497150421143, "logits/rejected": -3.830092668533325, "logps/chosen": -9.810124397277832, "logps/rejected": -27.96949577331543, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 2.570422649383545, "rewards/margins": 2.570422649383545, "rewards/rejected": 0.0, "step": 1010 }, { "epoch": 5.648044692737431, "grad_norm": 2.2812328890419256, "learning_rate": 8.175646109094204e-07, "logits/chosen": -3.7176613807678223, "logits/rejected": -3.571798801422119, "logps/chosen": -15.496166229248047, "logps/rejected": -27.049665451049805, "loss": 0.1145, "rewards/accuracies": 1.0, "rewards/chosen": 2.841841459274292, "rewards/margins": 2.841841459274292, "rewards/rejected": 0.0, "step": 1011 }, { "epoch": 5.653631284916202, "grad_norm": 3.264848508506336, "learning_rate": 8.17093559235186e-07, "logits/chosen": -3.532919406890869, "logits/rejected": -3.543309211730957, "logps/chosen": -2.1750073432922363, "logps/rejected": -64.8692855834961, "loss": 0.1453, "rewards/accuracies": 1.0, "rewards/chosen": 1.9546127319335938, "rewards/margins": 1.9546127319335938, "rewards/rejected": 0.0, "step": 1012 }, { "epoch": 5.659217877094972, "grad_norm": 5.876623350033097, "learning_rate": 8.166220363034513e-07, "logits/chosen": -3.6311070919036865, "logits/rejected": -3.5080888271331787, "logps/chosen": -10.898408889770508, "logps/rejected": -21.208297729492188, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 2.5477521419525146, "rewards/margins": 2.5477521419525146, "rewards/rejected": 0.0, "step": 1013 }, { "epoch": 5.664804469273743, "grad_norm": 2.544333298111979, "learning_rate": 8.161500428149832e-07, "logits/chosen": -3.4825284481048584, "logits/rejected": -3.7296884059906006, "logps/chosen": -4.902135848999023, "logps/rejected": -39.969940185546875, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 2.2294304370880127, "rewards/margins": 2.2294304370880127, "rewards/rejected": 0.0, "step": 1014 }, { "epoch": 5.670391061452514, "grad_norm": 4.401093520263053, "learning_rate": 8.156775794712483e-07, "logits/chosen": -3.7538087368011475, "logits/rejected": -3.8753857612609863, "logps/chosen": -7.648348808288574, "logps/rejected": -53.634117126464844, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 2.640012264251709, "rewards/margins": 2.640012264251709, "rewards/rejected": 0.0, "step": 1015 }, { "epoch": 5.675977653631285, "grad_norm": 8.629381180647874, "learning_rate": 8.152046469744114e-07, "logits/chosen": -3.2993342876434326, "logits/rejected": -3.439915895462036, "logps/chosen": -3.192535400390625, "logps/rejected": -77.20162963867188, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 2.2494940757751465, "rewards/margins": 2.2494940757751465, "rewards/rejected": 0.0, "step": 1016 }, { "epoch": 5.681564245810056, "grad_norm": 2.9299287157690532, "learning_rate": 8.14731246027334e-07, "logits/chosen": -3.7943758964538574, "logits/rejected": -3.711491823196411, "logps/chosen": -10.302391052246094, "logps/rejected": -29.03504753112793, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 2.7932891845703125, "rewards/margins": 2.7932891845703125, "rewards/rejected": 0.0, "step": 1017 }, { "epoch": 5.687150837988827, "grad_norm": 3.790138742496397, "learning_rate": 8.142573773335744e-07, "logits/chosen": -3.3959226608276367, "logits/rejected": -3.3778514862060547, "logps/chosen": -6.375226974487305, "logps/rejected": -24.757173538208008, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.3533895015716553, "rewards/margins": 2.3533895015716553, "rewards/rejected": 0.0, "step": 1018 }, { "epoch": 5.692737430167598, "grad_norm": 4.346472453742642, "learning_rate": 8.137830415973861e-07, "logits/chosen": -3.6288812160491943, "logits/rejected": -3.6254985332489014, "logps/chosen": -3.6216018199920654, "logps/rejected": -40.97502899169922, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 1.5600640773773193, "rewards/margins": 1.5600640773773193, "rewards/rejected": 0.0, "step": 1019 }, { "epoch": 5.698324022346369, "grad_norm": 1.5770828705824953, "learning_rate": 8.133082395237162e-07, "logits/chosen": -3.769218683242798, "logits/rejected": -3.8052897453308105, "logps/chosen": -2.879276752471924, "logps/rejected": -27.094223022460938, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 2.373605489730835, "rewards/margins": 2.373605489730835, "rewards/rejected": 0.0, "step": 1020 }, { "epoch": 5.698324022346369, "eval_logits/chosen": -3.4237537384033203, "eval_logits/rejected": -3.548767566680908, "eval_logps/chosen": -21.93404197692871, "eval_logps/rejected": -40.45651626586914, "eval_loss": 0.5296804308891296, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8135999441146851, "eval_rewards/margins": 0.8135999441146851, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7144, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 1020 }, { "epoch": 5.70391061452514, "grad_norm": 3.777357781410946, "learning_rate": 8.128329718182052e-07, "logits/chosen": -3.8146719932556152, "logits/rejected": -3.7009477615356445, "logps/chosen": -6.057042598724365, "logps/rejected": -34.36450958251953, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 2.0107688903808594, "rewards/margins": 2.0107688903808594, "rewards/rejected": 0.0, "step": 1021 }, { "epoch": 5.709497206703911, "grad_norm": 2.5088495568479985, "learning_rate": 8.123572391871855e-07, "logits/chosen": -3.6293792724609375, "logits/rejected": -3.667412281036377, "logps/chosen": -5.019689559936523, "logps/rejected": -38.291656494140625, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": 2.206315517425537, "rewards/margins": 2.206315517425537, "rewards/rejected": 0.0, "step": 1022 }, { "epoch": 5.715083798882682, "grad_norm": 4.798630862002979, "learning_rate": 8.118810423376808e-07, "logits/chosen": -3.55879545211792, "logits/rejected": -3.4452855587005615, "logps/chosen": -11.348285675048828, "logps/rejected": -28.296905517578125, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 3.0398333072662354, "rewards/margins": 3.0398333072662354, "rewards/rejected": 0.0, "step": 1023 }, { "epoch": 5.720670391061453, "grad_norm": 5.99365526397894, "learning_rate": 8.11404381977404e-07, "logits/chosen": -3.6069586277008057, "logits/rejected": -3.7181835174560547, "logps/chosen": -7.550654888153076, "logps/rejected": -25.838010787963867, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 2.031066417694092, "rewards/margins": 2.031066417694092, "rewards/rejected": 0.0, "step": 1024 }, { "epoch": 5.726256983240224, "grad_norm": 2.65472959335182, "learning_rate": 8.109272588147576e-07, "logits/chosen": -3.678741931915283, "logits/rejected": -3.7180609703063965, "logps/chosen": -11.061220169067383, "logps/rejected": -19.70529556274414, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 2.749025344848633, "rewards/margins": 2.749025344848633, "rewards/rejected": 0.0, "step": 1025 }, { "epoch": 5.731843575418995, "grad_norm": 3.7313982458131734, "learning_rate": 8.104496735588316e-07, "logits/chosen": -3.725174903869629, "logits/rejected": -3.6457762718200684, "logps/chosen": -11.355713844299316, "logps/rejected": -36.744564056396484, "loss": 0.128, "rewards/accuracies": 1.0, "rewards/chosen": 1.9542886018753052, "rewards/margins": 1.9542886018753052, "rewards/rejected": 0.0, "step": 1026 }, { "epoch": 5.737430167597766, "grad_norm": 4.039599732016783, "learning_rate": 8.099716269194025e-07, "logits/chosen": -3.6347756385803223, "logits/rejected": -4.101277828216553, "logps/chosen": -4.197378635406494, "logps/rejected": -55.23566436767578, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": 2.1737470626831055, "rewards/margins": 2.1737470626831055, "rewards/rejected": 0.0, "step": 1027 }, { "epoch": 5.743016759776537, "grad_norm": 9.835650662654556, "learning_rate": 8.094931196069327e-07, "logits/chosen": -3.9969725608825684, "logits/rejected": -4.015323162078857, "logps/chosen": -6.438171863555908, "logps/rejected": -23.971437454223633, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 2.403317451477051, "rewards/margins": 2.403317451477051, "rewards/rejected": 0.0, "step": 1028 }, { "epoch": 5.748603351955307, "grad_norm": 3.742173466066059, "learning_rate": 8.090141523325698e-07, "logits/chosen": -3.7224981784820557, "logits/rejected": -3.749600648880005, "logps/chosen": -20.025390625, "logps/rejected": -26.75469207763672, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 2.9519076347351074, "rewards/margins": 2.9519076347351074, "rewards/rejected": 0.0, "step": 1029 }, { "epoch": 5.754189944134078, "grad_norm": 3.520843064313235, "learning_rate": 8.085347258081439e-07, "logits/chosen": -3.7439513206481934, "logits/rejected": -3.7707877159118652, "logps/chosen": -6.769908905029297, "logps/rejected": -40.603904724121094, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": 2.562917947769165, "rewards/margins": 2.562917947769165, "rewards/rejected": 0.0, "step": 1030 }, { "epoch": 5.759776536312849, "grad_norm": 2.988490256300306, "learning_rate": 8.080548407461684e-07, "logits/chosen": -3.547328472137451, "logits/rejected": -3.535720109939575, "logps/chosen": -10.439088821411133, "logps/rejected": -41.473087310791016, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 2.3769493103027344, "rewards/margins": 2.3769493103027344, "rewards/rejected": 0.0, "step": 1031 }, { "epoch": 5.76536312849162, "grad_norm": 3.8913613629080124, "learning_rate": 8.07574497859838e-07, "logits/chosen": -3.9097604751586914, "logits/rejected": -3.8350741863250732, "logps/chosen": -2.4992494583129883, "logps/rejected": -51.154930114746094, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 1.5016214847564697, "rewards/margins": 1.5016214847564697, "rewards/rejected": 0.0, "step": 1032 }, { "epoch": 5.770949720670391, "grad_norm": 3.2573820900391017, "learning_rate": 8.070936978630279e-07, "logits/chosen": -3.601395845413208, "logits/rejected": -3.5447804927825928, "logps/chosen": -10.644867897033691, "logps/rejected": -23.28240203857422, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 2.515011787414551, "rewards/margins": 2.515011787414551, "rewards/rejected": 0.0, "step": 1033 }, { "epoch": 5.776536312849162, "grad_norm": 4.333177457770317, "learning_rate": 8.066124414702921e-07, "logits/chosen": -3.6848273277282715, "logits/rejected": -3.734130620956421, "logps/chosen": -9.491111755371094, "logps/rejected": -26.304800033569336, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": 2.5129756927490234, "rewards/margins": 2.5129756927490234, "rewards/rejected": 0.0, "step": 1034 }, { "epoch": 5.782122905027933, "grad_norm": 2.348425439806568, "learning_rate": 8.06130729396864e-07, "logits/chosen": -3.655939817428589, "logits/rejected": -3.356834888458252, "logps/chosen": -4.262138366699219, "logps/rejected": -49.86429977416992, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 1.8290448188781738, "rewards/margins": 1.8290448188781738, "rewards/rejected": 0.0, "step": 1035 }, { "epoch": 5.787709497206704, "grad_norm": 8.205788491058067, "learning_rate": 8.056485623586529e-07, "logits/chosen": -3.6774847507476807, "logits/rejected": -3.6492533683776855, "logps/chosen": -28.121702194213867, "logps/rejected": -27.963851928710938, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": 2.6612625122070312, "rewards/margins": 2.6612625122070312, "rewards/rejected": 0.0, "step": 1036 }, { "epoch": 5.793296089385475, "grad_norm": 3.7346531198512865, "learning_rate": 8.051659410722449e-07, "logits/chosen": -3.5452606678009033, "logits/rejected": -3.6970622539520264, "logps/chosen": -8.915273666381836, "logps/rejected": -38.14643096923828, "loss": 0.1517, "rewards/accuracies": 1.0, "rewards/chosen": 2.321495532989502, "rewards/margins": 2.321495532989502, "rewards/rejected": 0.0, "step": 1037 }, { "epoch": 5.798882681564246, "grad_norm": 4.525769205141487, "learning_rate": 8.046828662549016e-07, "logits/chosen": -3.406954526901245, "logits/rejected": -3.3492534160614014, "logps/chosen": -19.924955368041992, "logps/rejected": -37.408931732177734, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 2.026550769805908, "rewards/margins": 2.026550769805908, "rewards/rejected": 0.0, "step": 1038 }, { "epoch": 5.804469273743017, "grad_norm": 2.844572501441276, "learning_rate": 8.04199338624558e-07, "logits/chosen": -3.7834746837615967, "logits/rejected": -3.807988405227661, "logps/chosen": -6.659724235534668, "logps/rejected": -28.40852165222168, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 2.8151044845581055, "rewards/margins": 2.8151044845581055, "rewards/rejected": 0.0, "step": 1039 }, { "epoch": 5.810055865921788, "grad_norm": 7.162773196323615, "learning_rate": 8.037153588998223e-07, "logits/chosen": -3.5483334064483643, "logits/rejected": -3.4280738830566406, "logps/chosen": -22.878292083740234, "logps/rejected": -43.18260192871094, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 2.384970188140869, "rewards/margins": 2.384970188140869, "rewards/rejected": 0.0, "step": 1040 }, { "epoch": 5.810055865921788, "eval_logits/chosen": -3.41727876663208, "eval_logits/rejected": -3.543991804122925, "eval_logps/chosen": -21.917339324951172, "eval_logps/rejected": -40.18777847290039, "eval_loss": 0.5309573411941528, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8152703046798706, "eval_rewards/margins": 0.8152703046798706, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7306, "eval_samples_per_second": 9.471, "eval_steps_per_second": 0.306, "step": 1040 }, { "epoch": 5.815642458100559, "grad_norm": 5.937504038655379, "learning_rate": 8.032309277999745e-07, "logits/chosen": -3.1624813079833984, "logits/rejected": -3.0980820655822754, "logps/chosen": -14.842673301696777, "logps/rejected": -29.77103614807129, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": 2.7918972969055176, "rewards/margins": 2.7918972969055176, "rewards/rejected": 0.0, "step": 1041 }, { "epoch": 5.82122905027933, "grad_norm": 2.2425943551312097, "learning_rate": 8.027460460449655e-07, "logits/chosen": -3.483254909515381, "logits/rejected": -3.4363527297973633, "logps/chosen": -11.010003089904785, "logps/rejected": -24.420269012451172, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 2.189192295074463, "rewards/margins": 2.189192295074463, "rewards/rejected": 0.0, "step": 1042 }, { "epoch": 5.826815642458101, "grad_norm": 3.844927643246877, "learning_rate": 8.022607143554159e-07, "logits/chosen": -3.4990077018737793, "logits/rejected": -3.4998536109924316, "logps/chosen": -7.874691963195801, "logps/rejected": -34.47044372558594, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 2.21986985206604, "rewards/margins": 2.21986985206604, "rewards/rejected": 0.0, "step": 1043 }, { "epoch": 5.832402234636872, "grad_norm": 4.981271074838592, "learning_rate": 8.017749334526151e-07, "logits/chosen": -3.855689764022827, "logits/rejected": -3.902771234512329, "logps/chosen": -10.003469467163086, "logps/rejected": -64.29655456542969, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 2.3023786544799805, "rewards/margins": 2.3023786544799805, "rewards/rejected": 0.0, "step": 1044 }, { "epoch": 5.837988826815643, "grad_norm": 3.429561411893467, "learning_rate": 8.012887040585201e-07, "logits/chosen": -3.5651566982269287, "logits/rejected": -3.5544581413269043, "logps/chosen": -13.647613525390625, "logps/rejected": -47.53219985961914, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 2.001678466796875, "rewards/margins": 2.001678466796875, "rewards/rejected": 0.0, "step": 1045 }, { "epoch": 5.843575418994414, "grad_norm": 8.603228515077392, "learning_rate": 8.008020268957542e-07, "logits/chosen": -3.314690351486206, "logits/rejected": -3.366624355316162, "logps/chosen": -9.321515083312988, "logps/rejected": -22.403581619262695, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": 2.4352869987487793, "rewards/margins": 2.4352869987487793, "rewards/rejected": 0.0, "step": 1046 }, { "epoch": 5.849162011173185, "grad_norm": 3.6846850123231274, "learning_rate": 8.003149026876064e-07, "logits/chosen": -3.2948601245880127, "logits/rejected": -3.2105228900909424, "logps/chosen": -3.3570847511291504, "logps/rejected": -51.15724563598633, "loss": 0.1373, "rewards/accuracies": 1.0, "rewards/chosen": 2.01554012298584, "rewards/margins": 2.01554012298584, "rewards/rejected": 0.0, "step": 1047 }, { "epoch": 5.854748603351956, "grad_norm": 2.189238114286271, "learning_rate": 7.998273321580301e-07, "logits/chosen": -3.7571964263916016, "logits/rejected": -3.6517891883850098, "logps/chosen": -23.5264949798584, "logps/rejected": -43.30960464477539, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 3.620962142944336, "rewards/margins": 3.620962142944336, "rewards/rejected": 0.0, "step": 1048 }, { "epoch": 5.860335195530726, "grad_norm": 5.025232426897415, "learning_rate": 7.99339316031642e-07, "logits/chosen": -3.4721052646636963, "logits/rejected": -3.5792720317840576, "logps/chosen": -16.34332847595215, "logps/rejected": -51.911041259765625, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": 2.748723268508911, "rewards/margins": 2.748723268508911, "rewards/rejected": 0.0, "step": 1049 }, { "epoch": 5.865921787709497, "grad_norm": 5.05088904377592, "learning_rate": 7.988508550337206e-07, "logits/chosen": -3.437150001525879, "logits/rejected": -3.4027011394500732, "logps/chosen": -12.300931930541992, "logps/rejected": -26.506114959716797, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 2.8190996646881104, "rewards/margins": 2.8190996646881104, "rewards/rejected": 0.0, "step": 1050 }, { "epoch": 5.871508379888268, "grad_norm": 3.384330851640664, "learning_rate": 7.983619498902063e-07, "logits/chosen": -3.9197616577148438, "logits/rejected": -3.4385604858398438, "logps/chosen": -8.68655014038086, "logps/rejected": -55.796478271484375, "loss": 0.1511, "rewards/accuracies": 1.0, "rewards/chosen": 2.1019668579101562, "rewards/margins": 2.1019668579101562, "rewards/rejected": 0.0, "step": 1051 }, { "epoch": 5.877094972067039, "grad_norm": 4.992564809277405, "learning_rate": 7.978726013276993e-07, "logits/chosen": -3.516857862472534, "logits/rejected": -3.541065216064453, "logps/chosen": -2.68699049949646, "logps/rejected": -36.906333923339844, "loss": 0.1532, "rewards/accuracies": 1.0, "rewards/chosen": 1.6484850645065308, "rewards/margins": 1.6484850645065308, "rewards/rejected": 0.0, "step": 1052 }, { "epoch": 5.88268156424581, "grad_norm": 5.892173992077341, "learning_rate": 7.973828100734583e-07, "logits/chosen": -3.618582248687744, "logits/rejected": -3.6364614963531494, "logps/chosen": -36.14175796508789, "logps/rejected": -38.002197265625, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 2.6985483169555664, "rewards/margins": 2.6985483169555664, "rewards/rejected": 0.0, "step": 1053 }, { "epoch": 5.888268156424581, "grad_norm": 2.657512711385388, "learning_rate": 7.968925768554005e-07, "logits/chosen": -3.326869487762451, "logits/rejected": -3.615558385848999, "logps/chosen": -12.183609008789062, "logps/rejected": -40.154624938964844, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 2.736098051071167, "rewards/margins": 2.736098051071167, "rewards/rejected": 0.0, "step": 1054 }, { "epoch": 5.893854748603352, "grad_norm": 2.7462352809129795, "learning_rate": 7.964019024021e-07, "logits/chosen": -3.8306450843811035, "logits/rejected": -3.7286834716796875, "logps/chosen": -6.44540548324585, "logps/rejected": -84.51054382324219, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 2.596806526184082, "rewards/margins": 2.596806526184082, "rewards/rejected": 0.0, "step": 1055 }, { "epoch": 5.899441340782123, "grad_norm": 2.1605553755845204, "learning_rate": 7.959107874427862e-07, "logits/chosen": -3.3188657760620117, "logits/rejected": -3.349465847015381, "logps/chosen": -2.676693916320801, "logps/rejected": -86.0643081665039, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 1.8748719692230225, "rewards/margins": 1.8748719692230225, "rewards/rejected": 0.0, "step": 1056 }, { "epoch": 5.905027932960894, "grad_norm": 6.44224485187991, "learning_rate": 7.954192327073434e-07, "logits/chosen": -3.76053524017334, "logits/rejected": -3.7138631343841553, "logps/chosen": -27.124252319335938, "logps/rejected": -33.14948654174805, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 3.01265025138855, "rewards/margins": 3.01265025138855, "rewards/rejected": 0.0, "step": 1057 }, { "epoch": 5.910614525139665, "grad_norm": 5.831035365072501, "learning_rate": 7.949272389263095e-07, "logits/chosen": -3.683530807495117, "logits/rejected": -3.5552704334259033, "logps/chosen": -12.019757270812988, "logps/rejected": -34.60222625732422, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 3.1793899536132812, "rewards/margins": 3.1793899536132812, "rewards/rejected": 0.0, "step": 1058 }, { "epoch": 5.916201117318436, "grad_norm": 3.837026863586912, "learning_rate": 7.944348068308749e-07, "logits/chosen": -3.480384349822998, "logits/rejected": -3.5684092044830322, "logps/chosen": -3.4614875316619873, "logps/rejected": -17.95250701904297, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 1.811203956604004, "rewards/margins": 1.811203956604004, "rewards/rejected": 0.0, "step": 1059 }, { "epoch": 5.921787709497207, "grad_norm": 4.358596389031213, "learning_rate": 7.939419371528813e-07, "logits/chosen": -3.3316895961761475, "logits/rejected": -3.5211856365203857, "logps/chosen": -7.271438121795654, "logps/rejected": -38.95193099975586, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 2.3479537963867188, "rewards/margins": 2.3479537963867188, "rewards/rejected": 0.0, "step": 1060 }, { "epoch": 5.921787709497207, "eval_logits/chosen": -3.4203200340270996, "eval_logits/rejected": -3.545351028442383, "eval_logps/chosen": -21.673046112060547, "eval_logps/rejected": -40.24152755737305, "eval_loss": 0.5311393141746521, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.8396994471549988, "eval_rewards/margins": 0.8396994471549988, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7014, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 1060 }, { "epoch": 5.927374301675978, "grad_norm": 6.649311192603633, "learning_rate": 7.93448630624821e-07, "logits/chosen": -3.718939781188965, "logits/rejected": -3.7220516204833984, "logps/chosen": -20.689510345458984, "logps/rejected": -36.891448974609375, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 2.5494956970214844, "rewards/margins": 2.5494956970214844, "rewards/rejected": 0.0, "step": 1061 }, { "epoch": 5.932960893854749, "grad_norm": 4.235624484730155, "learning_rate": 7.929548879798352e-07, "logits/chosen": -3.562255859375, "logits/rejected": -3.632988691329956, "logps/chosen": -23.885807037353516, "logps/rejected": -48.20660400390625, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": 3.7936930656433105, "rewards/margins": 3.7936930656433105, "rewards/rejected": 0.0, "step": 1062 }, { "epoch": 5.93854748603352, "grad_norm": 6.259998385431391, "learning_rate": 7.924607099517134e-07, "logits/chosen": -3.5328803062438965, "logits/rejected": -3.5107877254486084, "logps/chosen": -24.228809356689453, "logps/rejected": -40.1533317565918, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 2.37412166595459, "rewards/margins": 2.37412166595459, "rewards/rejected": 0.0, "step": 1063 }, { "epoch": 5.94413407821229, "grad_norm": 2.4960125141521794, "learning_rate": 7.919660972748923e-07, "logits/chosen": -3.378631591796875, "logits/rejected": -3.5496344566345215, "logps/chosen": -2.1727137565612793, "logps/rejected": -64.11479187011719, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 1.8212121725082397, "rewards/margins": 1.8212121725082397, "rewards/rejected": 0.0, "step": 1064 }, { "epoch": 5.949720670391061, "grad_norm": 2.855849965245941, "learning_rate": 7.914710506844544e-07, "logits/chosen": -3.1338109970092773, "logits/rejected": -3.261589765548706, "logps/chosen": -9.45151138305664, "logps/rejected": -44.79124069213867, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 2.204951763153076, "rewards/margins": 2.204951763153076, "rewards/rejected": 0.0, "step": 1065 }, { "epoch": 5.955307262569832, "grad_norm": 2.7537517537632286, "learning_rate": 7.909755709161268e-07, "logits/chosen": -3.887150526046753, "logits/rejected": -3.8906705379486084, "logps/chosen": -2.40024995803833, "logps/rejected": -54.04871368408203, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 1.8678940534591675, "rewards/margins": 1.8678940534591675, "rewards/rejected": 0.0, "step": 1066 }, { "epoch": 5.960893854748603, "grad_norm": 3.683780045334, "learning_rate": 7.904796587062811e-07, "logits/chosen": -3.3832907676696777, "logits/rejected": -3.313124418258667, "logps/chosen": -16.009614944458008, "logps/rejected": -37.07920837402344, "loss": 0.1333, "rewards/accuracies": 1.0, "rewards/chosen": 2.613959789276123, "rewards/margins": 2.613959789276123, "rewards/rejected": 0.0, "step": 1067 }, { "epoch": 5.966480446927374, "grad_norm": 3.518954282147113, "learning_rate": 7.899833147919309e-07, "logits/chosen": -2.8122658729553223, "logits/rejected": -2.786893367767334, "logps/chosen": -6.672562599182129, "logps/rejected": -40.3980712890625, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 2.3488590717315674, "rewards/margins": 2.3488590717315674, "rewards/rejected": 0.0, "step": 1068 }, { "epoch": 5.972067039106145, "grad_norm": 5.106324928377724, "learning_rate": 7.894865399107317e-07, "logits/chosen": -3.834169387817383, "logits/rejected": -3.4647724628448486, "logps/chosen": -18.958227157592773, "logps/rejected": -43.90966796875, "loss": 0.129, "rewards/accuracies": 1.0, "rewards/chosen": 2.4929468631744385, "rewards/margins": 2.4929468631744385, "rewards/rejected": 0.0, "step": 1069 }, { "epoch": 5.977653631284916, "grad_norm": 8.920813378166514, "learning_rate": 7.889893348009795e-07, "logits/chosen": -3.744349956512451, "logits/rejected": -3.489316940307617, "logps/chosen": -17.75255012512207, "logps/rejected": -43.721099853515625, "loss": 0.152, "rewards/accuracies": 1.0, "rewards/chosen": 2.822836399078369, "rewards/margins": 2.822836399078369, "rewards/rejected": 0.0, "step": 1070 }, { "epoch": 5.983240223463687, "grad_norm": 13.605019074673555, "learning_rate": 7.884917002016096e-07, "logits/chosen": -3.423605442047119, "logits/rejected": -3.5889503955841064, "logps/chosen": -14.498847007751465, "logps/rejected": -23.585159301757812, "loss": 0.1945, "rewards/accuracies": 0.75, "rewards/chosen": 1.329728126525879, "rewards/margins": 1.329728126525879, "rewards/rejected": 0.0, "step": 1071 }, { "epoch": 5.988826815642458, "grad_norm": 3.1418183217119156, "learning_rate": 7.879936368521956e-07, "logits/chosen": -3.825878143310547, "logits/rejected": -3.856353998184204, "logps/chosen": -4.65446662902832, "logps/rejected": -27.227596282958984, "loss": 0.1321, "rewards/accuracies": 1.0, "rewards/chosen": 2.210226058959961, "rewards/margins": 2.210226058959961, "rewards/rejected": 0.0, "step": 1072 }, { "epoch": 5.994413407821229, "grad_norm": 6.986694525808208, "learning_rate": 7.874951454929483e-07, "logits/chosen": -3.3854620456695557, "logits/rejected": -3.423590660095215, "logps/chosen": -4.558568477630615, "logps/rejected": -60.397037506103516, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 2.3430333137512207, "rewards/margins": 2.3430333137512207, "rewards/rejected": 0.0, "step": 1073 }, { "epoch": 6.0, "grad_norm": 2.334469710023271, "learning_rate": 7.869962268647149e-07, "logits/chosen": -3.527007818222046, "logits/rejected": -3.4847359657287598, "logps/chosen": -4.882283687591553, "logps/rejected": -26.33693504333496, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 1.9604823589324951, "rewards/margins": 1.9604823589324951, "rewards/rejected": 0.0, "step": 1074 }, { "epoch": 6.005586592178771, "grad_norm": 2.4628652540684617, "learning_rate": 7.864968817089771e-07, "logits/chosen": -3.7268893718719482, "logits/rejected": -3.7700512409210205, "logps/chosen": -6.030098915100098, "logps/rejected": -31.229232788085938, "loss": 0.1234, "rewards/accuracies": 1.0, "rewards/chosen": 2.246878147125244, "rewards/margins": 2.246878147125244, "rewards/rejected": 0.0, "step": 1075 }, { "epoch": 6.011173184357542, "grad_norm": 2.202430159342966, "learning_rate": 7.859971107678507e-07, "logits/chosen": -3.4105281829833984, "logits/rejected": -3.4312808513641357, "logps/chosen": -8.663097381591797, "logps/rejected": -19.917152404785156, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 2.780505895614624, "rewards/margins": 2.780505895614624, "rewards/rejected": 0.0, "step": 1076 }, { "epoch": 6.016759776536313, "grad_norm": 2.712624777228195, "learning_rate": 7.854969147840844e-07, "logits/chosen": -3.333019733428955, "logits/rejected": -3.460301399230957, "logps/chosen": -39.58592224121094, "logps/rejected": -66.58130645751953, "loss": 0.1376, "rewards/accuracies": 1.0, "rewards/chosen": 2.9631738662719727, "rewards/margins": 2.9631738662719727, "rewards/rejected": 0.0, "step": 1077 }, { "epoch": 6.022346368715084, "grad_norm": 3.5542963226582662, "learning_rate": 7.849962945010586e-07, "logits/chosen": -3.498375415802002, "logits/rejected": -3.4425761699676514, "logps/chosen": -21.79305648803711, "logps/rejected": -28.10723304748535, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": 2.7957966327667236, "rewards/margins": 2.7957966327667236, "rewards/rejected": 0.0, "step": 1078 }, { "epoch": 6.027932960893855, "grad_norm": 1.490515196796577, "learning_rate": 7.84495250662784e-07, "logits/chosen": -3.419386625289917, "logits/rejected": -3.4304134845733643, "logps/chosen": -4.168264389038086, "logps/rejected": -52.03262710571289, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": 2.539543390274048, "rewards/margins": 2.539543390274048, "rewards/rejected": 0.0, "step": 1079 }, { "epoch": 6.033519553072626, "grad_norm": 2.0665741161577853, "learning_rate": 7.839937840139012e-07, "logits/chosen": -3.3358426094055176, "logits/rejected": -3.5571582317352295, "logps/chosen": -12.014814376831055, "logps/rejected": -39.008941650390625, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 2.8855361938476562, "rewards/margins": 2.8855361938476562, "rewards/rejected": 0.0, "step": 1080 }, { "epoch": 6.033519553072626, "eval_logits/chosen": -3.4154648780822754, "eval_logits/rejected": -3.541161298751831, "eval_logps/chosen": -22.391386032104492, "eval_logps/rejected": -40.84318161010742, "eval_loss": 0.5555992126464844, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": 0.7678656578063965, "eval_rewards/margins": 0.7678656578063965, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7461, "eval_samples_per_second": 9.467, "eval_steps_per_second": 0.305, "step": 1080 }, { "epoch": 6.039106145251397, "grad_norm": 3.631732747504115, "learning_rate": 7.834918952996787e-07, "logits/chosen": -3.3401753902435303, "logits/rejected": -3.1974246501922607, "logps/chosen": -13.715397834777832, "logps/rejected": -34.694549560546875, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 2.8850817680358887, "rewards/margins": 2.8850817680358887, "rewards/rejected": 0.0, "step": 1081 }, { "epoch": 6.044692737430168, "grad_norm": 1.400512054416602, "learning_rate": 7.829895852660129e-07, "logits/chosen": -3.5798497200012207, "logits/rejected": -3.602731466293335, "logps/chosen": -9.231255531311035, "logps/rejected": -17.62391471862793, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 2.3963284492492676, "rewards/margins": 2.3963284492492676, "rewards/rejected": 0.0, "step": 1082 }, { "epoch": 6.050279329608939, "grad_norm": 3.3975025017852483, "learning_rate": 7.824868546594254e-07, "logits/chosen": -3.515927791595459, "logits/rejected": -3.5182993412017822, "logps/chosen": -10.096548080444336, "logps/rejected": -19.822113037109375, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.589899778366089, "rewards/margins": 2.589899778366089, "rewards/rejected": 0.0, "step": 1083 }, { "epoch": 6.055865921787709, "grad_norm": 2.0597535081699094, "learning_rate": 7.819837042270639e-07, "logits/chosen": -3.2524971961975098, "logits/rejected": -3.3197362422943115, "logps/chosen": -5.943603515625, "logps/rejected": -34.231170654296875, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 2.120120048522949, "rewards/margins": 2.120120048522949, "rewards/rejected": 0.0, "step": 1084 }, { "epoch": 6.06145251396648, "grad_norm": 1.3017971192466566, "learning_rate": 7.814801347166992e-07, "logits/chosen": -3.6389377117156982, "logits/rejected": -3.7064850330352783, "logps/chosen": -1.456046223640442, "logps/rejected": -51.3591194152832, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 1.416407823562622, "rewards/margins": 1.416407823562622, "rewards/rejected": 0.0, "step": 1085 }, { "epoch": 6.067039106145251, "grad_norm": 1.680492345553586, "learning_rate": 7.809761468767256e-07, "logits/chosen": -3.8416240215301514, "logits/rejected": -3.710263729095459, "logps/chosen": -17.1885986328125, "logps/rejected": -58.12712097167969, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 2.589221954345703, "rewards/margins": 2.589221954345703, "rewards/rejected": 0.0, "step": 1086 }, { "epoch": 6.072625698324022, "grad_norm": 5.760094213348393, "learning_rate": 7.804717414561585e-07, "logits/chosen": -3.3121144771575928, "logits/rejected": -3.3707149028778076, "logps/chosen": -4.068519592285156, "logps/rejected": -72.06900024414062, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 1.890760898590088, "rewards/margins": 1.890760898590088, "rewards/rejected": 0.0, "step": 1087 }, { "epoch": 6.078212290502793, "grad_norm": 5.42449667131543, "learning_rate": 7.799669192046344e-07, "logits/chosen": -3.530776023864746, "logits/rejected": -3.664048433303833, "logps/chosen": -5.868477821350098, "logps/rejected": -43.213623046875, "loss": 0.151, "rewards/accuracies": 1.0, "rewards/chosen": 2.247279167175293, "rewards/margins": 2.247279167175293, "rewards/rejected": 0.0, "step": 1088 }, { "epoch": 6.083798882681564, "grad_norm": 1.3059898216049628, "learning_rate": 7.794616808724089e-07, "logits/chosen": -3.378438711166382, "logits/rejected": -3.362328052520752, "logps/chosen": -13.31649112701416, "logps/rejected": -52.14571762084961, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": 2.272395133972168, "rewards/margins": 2.272395133972168, "rewards/rejected": 0.0, "step": 1089 }, { "epoch": 6.089385474860335, "grad_norm": 5.847080787887179, "learning_rate": 7.789560272103563e-07, "logits/chosen": -3.6775078773498535, "logits/rejected": -3.790405511856079, "logps/chosen": -6.6467742919921875, "logps/rejected": -34.40592575073242, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 2.8223204612731934, "rewards/margins": 2.8223204612731934, "rewards/rejected": 0.0, "step": 1090 }, { "epoch": 6.094972067039106, "grad_norm": 2.3429210829400433, "learning_rate": 7.78449958969968e-07, "logits/chosen": -3.4835174083709717, "logits/rejected": -3.3803036212921143, "logps/chosen": -3.2445549964904785, "logps/rejected": -44.495182037353516, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 2.4009182453155518, "rewards/margins": 2.4009182453155518, "rewards/rejected": 0.0, "step": 1091 }, { "epoch": 6.100558659217877, "grad_norm": 3.9284252763194507, "learning_rate": 7.779434769033513e-07, "logits/chosen": -3.2931106090545654, "logits/rejected": -3.4320995807647705, "logps/chosen": -6.39832067489624, "logps/rejected": -28.484161376953125, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 1.5139878988265991, "rewards/margins": 1.5139878988265991, "rewards/rejected": 0.0, "step": 1092 }, { "epoch": 6.106145251396648, "grad_norm": 2.075697482606786, "learning_rate": 7.774365817632289e-07, "logits/chosen": -3.5117132663726807, "logits/rejected": -3.3721182346343994, "logps/chosen": -5.439365386962891, "logps/rejected": -24.08738136291504, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.383086681365967, "rewards/margins": 2.383086681365967, "rewards/rejected": 0.0, "step": 1093 }, { "epoch": 6.111731843575419, "grad_norm": 3.798888734456071, "learning_rate": 7.769292743029373e-07, "logits/chosen": -3.698251724243164, "logits/rejected": -3.4040896892547607, "logps/chosen": -11.444732666015625, "logps/rejected": -29.642263412475586, "loss": 0.1101, "rewards/accuracies": 1.0, "rewards/chosen": 2.803191661834717, "rewards/margins": 2.803191661834717, "rewards/rejected": 0.0, "step": 1094 }, { "epoch": 6.11731843575419, "grad_norm": 1.4851485378801812, "learning_rate": 7.764215552764258e-07, "logits/chosen": -3.5323593616485596, "logits/rejected": -3.5734283924102783, "logps/chosen": -1.9695065021514893, "logps/rejected": -35.47942352294922, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 2.191638231277466, "rewards/margins": 2.191638231277466, "rewards/rejected": 0.0, "step": 1095 }, { "epoch": 6.122905027932961, "grad_norm": 3.6155021859206116, "learning_rate": 7.759134254382549e-07, "logits/chosen": -3.7087323665618896, "logits/rejected": -3.744468927383423, "logps/chosen": -1.8839842081069946, "logps/rejected": -15.816478729248047, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 1.832279920578003, "rewards/margins": 1.832279920578003, "rewards/rejected": 0.0, "step": 1096 }, { "epoch": 6.128491620111732, "grad_norm": 1.505737569211408, "learning_rate": 7.754048855435964e-07, "logits/chosen": -3.5456957817077637, "logits/rejected": -3.611483335494995, "logps/chosen": -6.43549919128418, "logps/rejected": -24.106260299682617, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.4427661895751953, "rewards/margins": 2.4427661895751953, "rewards/rejected": 0.0, "step": 1097 }, { "epoch": 6.134078212290503, "grad_norm": 3.6197195471728216, "learning_rate": 7.748959363482313e-07, "logits/chosen": -3.683819532394409, "logits/rejected": -3.587161064147949, "logps/chosen": -5.716750621795654, "logps/rejected": -20.286861419677734, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 1.9873279333114624, "rewards/margins": 1.9873279333114624, "rewards/rejected": 0.0, "step": 1098 }, { "epoch": 6.139664804469274, "grad_norm": 2.5314161282485736, "learning_rate": 7.743865786085481e-07, "logits/chosen": -3.505862236022949, "logits/rejected": -3.561208963394165, "logps/chosen": -14.792437553405762, "logps/rejected": -40.24578857421875, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 2.70249605178833, "rewards/margins": 2.70249605178833, "rewards/rejected": 0.0, "step": 1099 }, { "epoch": 6.145251396648045, "grad_norm": 1.6066077502153424, "learning_rate": 7.738768130815438e-07, "logits/chosen": -3.3270411491394043, "logits/rejected": -3.530663251876831, "logps/chosen": -4.3255462646484375, "logps/rejected": -35.94929504394531, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 2.5646371841430664, "rewards/margins": 2.5646371841430664, "rewards/rejected": 0.0, "step": 1100 }, { "epoch": 6.145251396648045, "eval_logits/chosen": -3.408015727996826, "eval_logits/rejected": -3.5352530479431152, "eval_logps/chosen": -23.45020866394043, "eval_logps/rejected": -42.12792205810547, "eval_loss": 0.5975612998008728, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.6619831919670105, "eval_rewards/margins": 0.6619831919670105, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6794, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.306, "step": 1100 }, { "epoch": 6.150837988826815, "grad_norm": 1.566825548015347, "learning_rate": 7.733666405248203e-07, "logits/chosen": -3.6134276390075684, "logits/rejected": -3.5345585346221924, "logps/chosen": -13.918472290039062, "logps/rejected": -36.42315673828125, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 3.099599838256836, "rewards/margins": 3.099599838256836, "rewards/rejected": 0.0, "step": 1101 }, { "epoch": 6.156424581005586, "grad_norm": 3.3667862061296234, "learning_rate": 7.728560616965851e-07, "logits/chosen": -3.281236171722412, "logits/rejected": -3.4886529445648193, "logps/chosen": -17.144258499145508, "logps/rejected": -23.47547149658203, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 2.8349051475524902, "rewards/margins": 2.8349051475524902, "rewards/rejected": 0.0, "step": 1102 }, { "epoch": 6.162011173184357, "grad_norm": 6.492421992577809, "learning_rate": 7.723450773556493e-07, "logits/chosen": -3.350792646408081, "logits/rejected": -3.4875328540802, "logps/chosen": -38.94091796875, "logps/rejected": -45.85662841796875, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 3.9481279850006104, "rewards/margins": 3.9481279850006104, "rewards/rejected": 0.0, "step": 1103 }, { "epoch": 6.167597765363128, "grad_norm": 2.360987681881043, "learning_rate": 7.718336882614264e-07, "logits/chosen": -3.499664545059204, "logits/rejected": -3.5045888423919678, "logps/chosen": -14.796363830566406, "logps/rejected": -41.72339630126953, "loss": 0.1211, "rewards/accuracies": 1.0, "rewards/chosen": 2.8475091457366943, "rewards/margins": 2.8475091457366943, "rewards/rejected": 0.0, "step": 1104 }, { "epoch": 6.173184357541899, "grad_norm": 4.889157305192587, "learning_rate": 7.71321895173932e-07, "logits/chosen": -3.3693530559539795, "logits/rejected": -3.3101730346679688, "logps/chosen": -10.217439651489258, "logps/rejected": -41.31106185913086, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 2.839888095855713, "rewards/margins": 2.839888095855713, "rewards/rejected": 0.0, "step": 1105 }, { "epoch": 6.17877094972067, "grad_norm": 1.4209093612701365, "learning_rate": 7.708096988537815e-07, "logits/chosen": -3.4527056217193604, "logits/rejected": -3.456287145614624, "logps/chosen": -1.576203465461731, "logps/rejected": -31.562179565429688, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 1.8774808645248413, "rewards/margins": 1.8774808645248413, "rewards/rejected": 0.0, "step": 1106 }, { "epoch": 6.184357541899441, "grad_norm": 2.245537768734017, "learning_rate": 7.702971000621898e-07, "logits/chosen": -3.5811023712158203, "logits/rejected": -3.478609561920166, "logps/chosen": -20.62826156616211, "logps/rejected": -53.61195373535156, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 3.106804609298706, "rewards/margins": 3.106804609298706, "rewards/rejected": 0.0, "step": 1107 }, { "epoch": 6.189944134078212, "grad_norm": 5.3723395536066265, "learning_rate": 7.697840995609703e-07, "logits/chosen": -3.7272963523864746, "logits/rejected": -3.635612964630127, "logps/chosen": -3.4792723655700684, "logps/rejected": -27.30158805847168, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 2.251101016998291, "rewards/margins": 2.251101016998291, "rewards/rejected": 0.0, "step": 1108 }, { "epoch": 6.195530726256983, "grad_norm": 7.300222095177023, "learning_rate": 7.692706981125328e-07, "logits/chosen": -3.196547508239746, "logits/rejected": -3.2908828258514404, "logps/chosen": -3.8858413696289062, "logps/rejected": -41.947906494140625, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 1.6939663887023926, "rewards/margins": 1.6939663887023926, "rewards/rejected": 0.0, "step": 1109 }, { "epoch": 6.201117318435754, "grad_norm": 3.8000838724091492, "learning_rate": 7.687568964798835e-07, "logits/chosen": -3.5294885635375977, "logits/rejected": -3.667855739593506, "logps/chosen": -23.49508285522461, "logps/rejected": -42.863075256347656, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 3.693263053894043, "rewards/margins": 3.693263053894043, "rewards/rejected": 0.0, "step": 1110 }, { "epoch": 6.206703910614525, "grad_norm": 1.4707324906429422, "learning_rate": 7.682426954266231e-07, "logits/chosen": -3.5440869331359863, "logits/rejected": -3.706747531890869, "logps/chosen": -10.006698608398438, "logps/rejected": -21.6866455078125, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 2.6204147338867188, "rewards/margins": 2.6204147338867188, "rewards/rejected": 0.0, "step": 1111 }, { "epoch": 6.212290502793296, "grad_norm": 2.95793580906938, "learning_rate": 7.677280957169459e-07, "logits/chosen": -3.447383165359497, "logits/rejected": -3.511056661605835, "logps/chosen": -3.379901647567749, "logps/rejected": -44.22206497192383, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 2.633761167526245, "rewards/margins": 2.633761167526245, "rewards/rejected": 0.0, "step": 1112 }, { "epoch": 6.217877094972067, "grad_norm": 3.9685239765315483, "learning_rate": 7.672130981156386e-07, "logits/chosen": -3.4409449100494385, "logits/rejected": -3.4873509407043457, "logps/chosen": -4.753771781921387, "logps/rejected": -38.67449188232422, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 1.859330654144287, "rewards/margins": 1.859330654144287, "rewards/rejected": 0.0, "step": 1113 }, { "epoch": 6.223463687150838, "grad_norm": 5.311161386369822, "learning_rate": 7.666977033880795e-07, "logits/chosen": -3.696667432785034, "logits/rejected": -3.5870654582977295, "logps/chosen": -11.565668106079102, "logps/rejected": -21.682395935058594, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 2.505594491958618, "rewards/margins": 2.505594491958618, "rewards/rejected": 0.0, "step": 1114 }, { "epoch": 6.229050279329609, "grad_norm": 2.649344486648601, "learning_rate": 7.661819123002366e-07, "logits/chosen": -3.3797621726989746, "logits/rejected": -3.473595380783081, "logps/chosen": -2.7563633918762207, "logps/rejected": -90.32902526855469, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 2.310032367706299, "rewards/margins": 2.310032367706299, "rewards/rejected": 0.0, "step": 1115 }, { "epoch": 6.23463687150838, "grad_norm": 2.7494213517881807, "learning_rate": 7.65665725618668e-07, "logits/chosen": -3.3604629039764404, "logits/rejected": -3.2099506855010986, "logps/chosen": -22.696582794189453, "logps/rejected": -47.31389617919922, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 3.088898181915283, "rewards/margins": 3.088898181915283, "rewards/rejected": 0.0, "step": 1116 }, { "epoch": 6.240223463687151, "grad_norm": 4.959981557959078, "learning_rate": 7.651491441105187e-07, "logits/chosen": -3.936720848083496, "logits/rejected": -3.9750871658325195, "logps/chosen": -2.2815120220184326, "logps/rejected": -25.254283905029297, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 2.22707200050354, "rewards/margins": 2.22707200050354, "rewards/rejected": 0.0, "step": 1117 }, { "epoch": 6.245810055865922, "grad_norm": 4.277199276005058, "learning_rate": 7.646321685435205e-07, "logits/chosen": -3.290116548538208, "logits/rejected": -3.431396245956421, "logps/chosen": -2.931079864501953, "logps/rejected": -33.018619537353516, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": 2.4505295753479004, "rewards/margins": 2.4505295753479004, "rewards/rejected": 0.0, "step": 1118 }, { "epoch": 6.251396648044693, "grad_norm": 1.613438108862121, "learning_rate": 7.641147996859916e-07, "logits/chosen": -3.8043417930603027, "logits/rejected": -3.849745035171509, "logps/chosen": -1.462568759918213, "logps/rejected": -29.720266342163086, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 1.598609447479248, "rewards/margins": 1.598609447479248, "rewards/rejected": 0.0, "step": 1119 }, { "epoch": 6.256983240223463, "grad_norm": 2.9443967723445663, "learning_rate": 7.63597038306834e-07, "logits/chosen": -3.5113418102264404, "logits/rejected": -3.5904808044433594, "logps/chosen": -1.8839757442474365, "logps/rejected": -21.58673858642578, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 1.9584304094314575, "rewards/margins": 1.9584304094314575, "rewards/rejected": 0.0, "step": 1120 }, { "epoch": 6.256983240223463, "eval_logits/chosen": -3.404155731201172, "eval_logits/rejected": -3.5301384925842285, "eval_logps/chosen": -23.150352478027344, "eval_logps/rejected": -42.37028503417969, "eval_loss": 0.586790144443512, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.6919690370559692, "eval_rewards/margins": 0.6919690370559692, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7298, "eval_samples_per_second": 9.471, "eval_steps_per_second": 0.306, "step": 1120 }, { "epoch": 6.262569832402234, "grad_norm": 2.1725677329370794, "learning_rate": 7.630788851755334e-07, "logits/chosen": -3.3949060440063477, "logits/rejected": -3.3819336891174316, "logps/chosen": -2.6754307746887207, "logps/rejected": -24.352420806884766, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 2.228742837905884, "rewards/margins": 2.228742837905884, "rewards/rejected": 0.0, "step": 1121 }, { "epoch": 6.268156424581005, "grad_norm": 1.3327566934330697, "learning_rate": 7.625603410621578e-07, "logits/chosen": -3.430837392807007, "logits/rejected": -3.391392946243286, "logps/chosen": -7.514692306518555, "logps/rejected": -31.07344627380371, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 2.1787056922912598, "rewards/margins": 2.1787056922912598, "rewards/rejected": 0.0, "step": 1122 }, { "epoch": 6.273743016759776, "grad_norm": 2.3769441085508225, "learning_rate": 7.620414067373558e-07, "logits/chosen": -3.2495040893554688, "logits/rejected": -3.4531049728393555, "logps/chosen": -11.662211418151855, "logps/rejected": -50.61724090576172, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": 2.59584641456604, "rewards/margins": 2.59584641456604, "rewards/rejected": 0.0, "step": 1123 }, { "epoch": 6.279329608938547, "grad_norm": 2.8236764416772027, "learning_rate": 7.615220829723563e-07, "logits/chosen": -3.635977268218994, "logits/rejected": -3.314913749694824, "logps/chosen": -2.3783607482910156, "logps/rejected": -71.88471984863281, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 1.5717906951904297, "rewards/margins": 1.5717906951904297, "rewards/rejected": 0.0, "step": 1124 }, { "epoch": 6.284916201117318, "grad_norm": 5.067874331233562, "learning_rate": 7.610023705389672e-07, "logits/chosen": -3.6423583030700684, "logits/rejected": -3.7084062099456787, "logps/chosen": -2.9335761070251465, "logps/rejected": -29.922504425048828, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 2.2978029251098633, "rewards/margins": 2.2978029251098633, "rewards/rejected": 0.0, "step": 1125 }, { "epoch": 6.290502793296089, "grad_norm": 1.682862918959285, "learning_rate": 7.604822702095733e-07, "logits/chosen": -3.428931951522827, "logits/rejected": -3.549999713897705, "logps/chosen": -9.16401195526123, "logps/rejected": -43.981082916259766, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 2.355374813079834, "rewards/margins": 2.355374813079834, "rewards/rejected": 0.0, "step": 1126 }, { "epoch": 6.29608938547486, "grad_norm": 1.5641957951116872, "learning_rate": 7.599617827571366e-07, "logits/chosen": -3.205566167831421, "logits/rejected": -3.266538619995117, "logps/chosen": -3.9005112648010254, "logps/rejected": -35.63095474243164, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 2.4671335220336914, "rewards/margins": 2.4671335220336914, "rewards/rejected": 0.0, "step": 1127 }, { "epoch": 6.301675977653631, "grad_norm": 1.3903496328047276, "learning_rate": 7.594409089551941e-07, "logits/chosen": -3.446619987487793, "logits/rejected": -3.578960657119751, "logps/chosen": -3.6555418968200684, "logps/rejected": -26.66632080078125, "loss": 0.1282, "rewards/accuracies": 1.0, "rewards/chosen": 2.413005828857422, "rewards/margins": 2.413005828857422, "rewards/rejected": 0.0, "step": 1128 }, { "epoch": 6.307262569832402, "grad_norm": 4.070485445328956, "learning_rate": 7.58919649577857e-07, "logits/chosen": -3.253666639328003, "logits/rejected": -3.2835707664489746, "logps/chosen": -12.576066017150879, "logps/rejected": -61.259986877441406, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 2.4137935638427734, "rewards/margins": 2.4137935638427734, "rewards/rejected": 0.0, "step": 1129 }, { "epoch": 6.312849162011173, "grad_norm": 6.158340194259556, "learning_rate": 7.583980053998094e-07, "logits/chosen": -3.6586406230926514, "logits/rejected": -3.458526372909546, "logps/chosen": -9.13015365600586, "logps/rejected": -44.24652099609375, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 2.887941360473633, "rewards/margins": 2.887941360473633, "rewards/rejected": 0.0, "step": 1130 }, { "epoch": 6.318435754189944, "grad_norm": 3.9187148644017924, "learning_rate": 7.578759771963077e-07, "logits/chosen": -3.6340296268463135, "logits/rejected": -3.5403378009796143, "logps/chosen": -30.52096939086914, "logps/rejected": -27.17243194580078, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 3.3830909729003906, "rewards/margins": 3.3830909729003906, "rewards/rejected": 0.0, "step": 1131 }, { "epoch": 6.324022346368715, "grad_norm": 2.388480448849047, "learning_rate": 7.573535657431786e-07, "logits/chosen": -3.5924248695373535, "logits/rejected": -3.5247814655303955, "logps/chosen": -8.00168514251709, "logps/rejected": -55.73610305786133, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 2.651998519897461, "rewards/margins": 2.651998519897461, "rewards/rejected": 0.0, "step": 1132 }, { "epoch": 6.329608938547486, "grad_norm": 2.735087380655357, "learning_rate": 7.568307718168189e-07, "logits/chosen": -3.3794960975646973, "logits/rejected": -3.631844997406006, "logps/chosen": -3.5662617683410645, "logps/rejected": -48.561580657958984, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 2.1723389625549316, "rewards/margins": 2.1723389625549316, "rewards/rejected": 0.0, "step": 1133 }, { "epoch": 6.335195530726257, "grad_norm": 3.0594035105757844, "learning_rate": 7.563075961941929e-07, "logits/chosen": -3.596266746520996, "logits/rejected": -3.5486643314361572, "logps/chosen": -12.418999671936035, "logps/rejected": -45.26614761352539, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 3.0499706268310547, "rewards/margins": 3.0499706268310547, "rewards/rejected": 0.0, "step": 1134 }, { "epoch": 6.340782122905028, "grad_norm": 4.442309247909587, "learning_rate": 7.557840396528333e-07, "logits/chosen": -3.2268638610839844, "logits/rejected": -3.0698015689849854, "logps/chosen": -6.380509376525879, "logps/rejected": -24.476642608642578, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 2.8292136192321777, "rewards/margins": 2.8292136192321777, "rewards/rejected": 0.0, "step": 1135 }, { "epoch": 6.346368715083799, "grad_norm": 6.983921678944319, "learning_rate": 7.55260102970838e-07, "logits/chosen": -3.9538321495056152, "logits/rejected": -3.7950656414031982, "logps/chosen": -12.06083869934082, "logps/rejected": -28.718433380126953, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 3.0041298866271973, "rewards/margins": 3.0041298866271973, "rewards/rejected": 0.0, "step": 1136 }, { "epoch": 6.351955307262569, "grad_norm": 1.6742125231229434, "learning_rate": 7.547357869268704e-07, "logits/chosen": -3.7847020626068115, "logits/rejected": -3.9054253101348877, "logps/chosen": -3.6618285179138184, "logps/rejected": -38.363895416259766, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 2.2039732933044434, "rewards/margins": 2.2039732933044434, "rewards/rejected": 0.0, "step": 1137 }, { "epoch": 6.35754189944134, "grad_norm": 3.190302920370684, "learning_rate": 7.542110923001576e-07, "logits/chosen": -3.446341037750244, "logits/rejected": -3.7196896076202393, "logps/chosen": -5.214937686920166, "logps/rejected": -48.24104309082031, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 2.1412739753723145, "rewards/margins": 2.1412739753723145, "rewards/rejected": 0.0, "step": 1138 }, { "epoch": 6.363128491620111, "grad_norm": 5.672963080835398, "learning_rate": 7.53686019870489e-07, "logits/chosen": -3.458178997039795, "logits/rejected": -3.370027780532837, "logps/chosen": -13.658634185791016, "logps/rejected": -23.169994354248047, "loss": 0.1501, "rewards/accuracies": 1.0, "rewards/chosen": 2.8471968173980713, "rewards/margins": 2.8471968173980713, "rewards/rejected": 0.0, "step": 1139 }, { "epoch": 6.368715083798882, "grad_norm": 3.58288083102813, "learning_rate": 7.531605704182158e-07, "logits/chosen": -3.4480130672454834, "logits/rejected": -3.4778430461883545, "logps/chosen": -1.6349246501922607, "logps/rejected": -40.36206817626953, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 1.9295605421066284, "rewards/margins": 1.9295605421066284, "rewards/rejected": 0.0, "step": 1140 }, { "epoch": 6.368715083798882, "eval_logits/chosen": -3.388373613357544, "eval_logits/rejected": -3.5139546394348145, "eval_logps/chosen": -23.564157485961914, "eval_logps/rejected": -42.2648811340332, "eval_loss": 0.5904839634895325, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 0.6505883932113647, "eval_rewards/margins": 0.6505883932113647, "eval_rewards/rejected": 0.0, "eval_runtime": 32.737, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 1140 }, { "epoch": 6.374301675977653, "grad_norm": 2.07573234783886, "learning_rate": 7.526347447242496e-07, "logits/chosen": -3.5074241161346436, "logits/rejected": -3.385617733001709, "logps/chosen": -21.858230590820312, "logps/rejected": -38.5920524597168, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 3.0649476051330566, "rewards/margins": 3.0649476051330566, "rewards/rejected": 0.0, "step": 1141 }, { "epoch": 6.379888268156424, "grad_norm": 1.9055519576325777, "learning_rate": 7.521085435700609e-07, "logits/chosen": -3.439046859741211, "logits/rejected": -3.6108994483947754, "logps/chosen": -4.852899551391602, "logps/rejected": -29.23027992248535, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 2.1748762130737305, "rewards/margins": 2.1748762130737305, "rewards/rejected": 0.0, "step": 1142 }, { "epoch": 6.385474860335195, "grad_norm": 7.2018815283622315, "learning_rate": 7.515819677376786e-07, "logits/chosen": -3.479985237121582, "logits/rejected": -3.486938714981079, "logps/chosen": -7.141788482666016, "logps/rejected": -39.78456115722656, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 2.5886242389678955, "rewards/margins": 2.5886242389678955, "rewards/rejected": 0.0, "step": 1143 }, { "epoch": 6.391061452513966, "grad_norm": 4.999108636137947, "learning_rate": 7.510550180096876e-07, "logits/chosen": -3.5652968883514404, "logits/rejected": -3.490980625152588, "logps/chosen": -10.460119247436523, "logps/rejected": -16.658226013183594, "loss": 0.128, "rewards/accuracies": 1.0, "rewards/chosen": 2.4940853118896484, "rewards/margins": 2.4940853118896484, "rewards/rejected": 0.0, "step": 1144 }, { "epoch": 6.396648044692737, "grad_norm": 2.5838107318157224, "learning_rate": 7.505276951692296e-07, "logits/chosen": -3.634608030319214, "logits/rejected": -3.643927812576294, "logps/chosen": -4.419398307800293, "logps/rejected": -59.13410949707031, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": 2.5251169204711914, "rewards/margins": 2.5251169204711914, "rewards/rejected": 0.0, "step": 1145 }, { "epoch": 6.402234636871508, "grad_norm": 3.3446696073869164, "learning_rate": 7.5e-07, "logits/chosen": -3.5030786991119385, "logits/rejected": -3.564039468765259, "logps/chosen": -9.80803108215332, "logps/rejected": -40.293479919433594, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 2.9979138374328613, "rewards/margins": 2.9979138374328613, "rewards/rejected": 0.0, "step": 1146 }, { "epoch": 6.407821229050279, "grad_norm": 1.9616353941615796, "learning_rate": 7.494719332862478e-07, "logits/chosen": -3.623610258102417, "logits/rejected": -3.1363115310668945, "logps/chosen": -1.4352056980133057, "logps/rejected": -26.92434310913086, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 1.8442667722702026, "rewards/margins": 1.8442667722702026, "rewards/rejected": 0.0, "step": 1147 }, { "epoch": 6.41340782122905, "grad_norm": 3.3440966999791693, "learning_rate": 7.489434958127742e-07, "logits/chosen": -3.5022785663604736, "logits/rejected": -3.3467249870300293, "logps/chosen": -13.047353744506836, "logps/rejected": -58.06136703491211, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": 2.292217969894409, "rewards/margins": 2.292217969894409, "rewards/rejected": 0.0, "step": 1148 }, { "epoch": 6.418994413407821, "grad_norm": 9.914571903712174, "learning_rate": 7.484146883649315e-07, "logits/chosen": -3.668579339981079, "logits/rejected": -3.719177722930908, "logps/chosen": -3.6147677898406982, "logps/rejected": -43.331233978271484, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 2.225457191467285, "rewards/margins": 2.225457191467285, "rewards/rejected": 0.0, "step": 1149 }, { "epoch": 6.424581005586592, "grad_norm": 1.8114041552518896, "learning_rate": 7.478855117286216e-07, "logits/chosen": -3.4629340171813965, "logits/rejected": -3.67208194732666, "logps/chosen": -22.60823631286621, "logps/rejected": -29.602725982666016, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 2.589573860168457, "rewards/margins": 2.589573860168457, "rewards/rejected": 0.0, "step": 1150 }, { "epoch": 6.430167597765363, "grad_norm": 8.34282059434499, "learning_rate": 7.473559666902953e-07, "logits/chosen": -3.734013557434082, "logits/rejected": -3.6599209308624268, "logps/chosen": -1.3387436866760254, "logps/rejected": -31.555988311767578, "loss": 0.1494, "rewards/accuracies": 1.0, "rewards/chosen": 1.5991384983062744, "rewards/margins": 1.5991384983062744, "rewards/rejected": 0.0, "step": 1151 }, { "epoch": 6.435754189944134, "grad_norm": 2.1156571176367254, "learning_rate": 7.468260540369509e-07, "logits/chosen": -3.589686632156372, "logits/rejected": -3.5712993144989014, "logps/chosen": -9.595553398132324, "logps/rejected": -41.65563201904297, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 2.34926700592041, "rewards/margins": 2.34926700592041, "rewards/rejected": 0.0, "step": 1152 }, { "epoch": 6.441340782122905, "grad_norm": 2.068700473887163, "learning_rate": 7.462957745561327e-07, "logits/chosen": -3.513237953186035, "logits/rejected": -3.693328619003296, "logps/chosen": -14.855696678161621, "logps/rejected": -40.235191345214844, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 3.3135600090026855, "rewards/margins": 3.3135600090026855, "rewards/rejected": 0.0, "step": 1153 }, { "epoch": 6.446927374301676, "grad_norm": 2.8361763613552813, "learning_rate": 7.457651290359306e-07, "logits/chosen": -3.4706835746765137, "logits/rejected": -3.6301958560943604, "logps/chosen": -31.08868408203125, "logps/rejected": -78.93798065185547, "loss": 0.1326, "rewards/accuracies": 1.0, "rewards/chosen": 2.295527935028076, "rewards/margins": 2.295527935028076, "rewards/rejected": 0.0, "step": 1154 }, { "epoch": 6.452513966480447, "grad_norm": 2.782776862415484, "learning_rate": 7.452341182649787e-07, "logits/chosen": -3.7000601291656494, "logits/rejected": -3.6847100257873535, "logps/chosen": -8.851727485656738, "logps/rejected": -35.246849060058594, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 2.5256786346435547, "rewards/margins": 2.5256786346435547, "rewards/rejected": 0.0, "step": 1155 }, { "epoch": 6.4581005586592175, "grad_norm": 5.337794330407838, "learning_rate": 7.447027430324529e-07, "logits/chosen": -3.444211006164551, "logits/rejected": -3.5518124103546143, "logps/chosen": -10.179937362670898, "logps/rejected": -24.692378997802734, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 2.374847888946533, "rewards/margins": 2.374847888946533, "rewards/rejected": 0.0, "step": 1156 }, { "epoch": 6.4636871508379885, "grad_norm": 2.392044190739456, "learning_rate": 7.441710041280719e-07, "logits/chosen": -3.629194498062134, "logits/rejected": -3.6020259857177734, "logps/chosen": -1.9871127605438232, "logps/rejected": -81.12548828125, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 2.277148723602295, "rewards/margins": 2.277148723602295, "rewards/rejected": 0.0, "step": 1157 }, { "epoch": 6.4692737430167595, "grad_norm": 1.7949174942430215, "learning_rate": 7.436389023420942e-07, "logits/chosen": -3.7679221630096436, "logits/rejected": -3.652407646179199, "logps/chosen": -5.68017578125, "logps/rejected": -47.26246643066406, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 2.207443952560425, "rewards/margins": 2.207443952560425, "rewards/rejected": 0.0, "step": 1158 }, { "epoch": 6.4748603351955305, "grad_norm": 2.546044260638092, "learning_rate": 7.431064384653181e-07, "logits/chosen": -3.419480085372925, "logits/rejected": -3.672830104827881, "logps/chosen": -3.2580134868621826, "logps/rejected": -58.788848876953125, "loss": 0.1396, "rewards/accuracies": 1.0, "rewards/chosen": 2.158781051635742, "rewards/margins": 2.158781051635742, "rewards/rejected": 0.0, "step": 1159 }, { "epoch": 6.4804469273743015, "grad_norm": 3.3398948644505384, "learning_rate": 7.425736132890794e-07, "logits/chosen": -3.4635164737701416, "logits/rejected": -3.459397315979004, "logps/chosen": -15.559146881103516, "logps/rejected": -37.985374450683594, "loss": 0.1121, "rewards/accuracies": 1.0, "rewards/chosen": 2.4702444076538086, "rewards/margins": 2.4702444076538086, "rewards/rejected": 0.0, "step": 1160 }, { "epoch": 6.4804469273743015, "eval_logits/chosen": -3.3824734687805176, "eval_logits/rejected": -3.5105621814727783, "eval_logps/chosen": -22.876380920410156, "eval_logps/rejected": -42.273921966552734, "eval_loss": 0.5851587057113647, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 0.7193660736083984, "eval_rewards/margins": 0.7193660736083984, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6808, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.306, "step": 1160 }, { "epoch": 6.4860335195530725, "grad_norm": 1.61387712350269, "learning_rate": 7.420404276052516e-07, "logits/chosen": -3.4014992713928223, "logits/rejected": -3.5311710834503174, "logps/chosen": -4.388230800628662, "logps/rejected": -46.40802001953125, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 2.1586849689483643, "rewards/margins": 2.1586849689483643, "rewards/rejected": 0.0, "step": 1161 }, { "epoch": 6.4916201117318435, "grad_norm": 1.800848057129923, "learning_rate": 7.415068822062431e-07, "logits/chosen": -3.586928367614746, "logits/rejected": -3.7211384773254395, "logps/chosen": -3.3956210613250732, "logps/rejected": -46.08708190917969, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 2.251286029815674, "rewards/margins": 2.251286029815674, "rewards/rejected": 0.0, "step": 1162 }, { "epoch": 6.4972067039106145, "grad_norm": 4.4067031135950145, "learning_rate": 7.409729778849976e-07, "logits/chosen": -3.4703292846679688, "logits/rejected": -3.460296869277954, "logps/chosen": -5.283385276794434, "logps/rejected": -55.7769889831543, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 2.5026423931121826, "rewards/margins": 2.5026423931121826, "rewards/rejected": 0.0, "step": 1163 }, { "epoch": 6.5027932960893855, "grad_norm": 7.293108498783106, "learning_rate": 7.40438715434992e-07, "logits/chosen": -3.7694311141967773, "logits/rejected": -3.788268804550171, "logps/chosen": -1.3201477527618408, "logps/rejected": -35.48125076293945, "loss": 0.1356, "rewards/accuracies": 1.0, "rewards/chosen": 1.9290229082107544, "rewards/margins": 1.9290229082107544, "rewards/rejected": 0.0, "step": 1164 }, { "epoch": 6.5083798882681565, "grad_norm": 1.9437543708834484, "learning_rate": 7.399040956502357e-07, "logits/chosen": -3.42350697517395, "logits/rejected": -3.4623332023620605, "logps/chosen": -6.216604709625244, "logps/rejected": -44.93048095703125, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.248582601547241, "rewards/margins": 2.248582601547241, "rewards/rejected": 0.0, "step": 1165 }, { "epoch": 6.5139664804469275, "grad_norm": 4.817821483946756, "learning_rate": 7.393691193252685e-07, "logits/chosen": -3.5924384593963623, "logits/rejected": -3.4373843669891357, "logps/chosen": -12.131484985351562, "logps/rejected": -19.567914962768555, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 2.4102120399475098, "rewards/margins": 2.4102120399475098, "rewards/rejected": 0.0, "step": 1166 }, { "epoch": 6.5195530726256985, "grad_norm": 4.3216972734564605, "learning_rate": 7.388337872551606e-07, "logits/chosen": -3.5208194255828857, "logits/rejected": -3.3143310546875, "logps/chosen": -31.798171997070312, "logps/rejected": -21.487539291381836, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 2.6758854389190674, "rewards/margins": 2.6758854389190674, "rewards/rejected": 0.0, "step": 1167 }, { "epoch": 6.5251396648044695, "grad_norm": 2.638264524988585, "learning_rate": 7.382981002355111e-07, "logits/chosen": -3.5087356567382812, "logits/rejected": -3.5776567459106445, "logps/chosen": -25.8616943359375, "logps/rejected": -48.079994201660156, "loss": 0.1161, "rewards/accuracies": 1.0, "rewards/chosen": 3.036346673965454, "rewards/margins": 3.036346673965454, "rewards/rejected": 0.0, "step": 1168 }, { "epoch": 6.5307262569832405, "grad_norm": 1.879004053663191, "learning_rate": 7.377620590624461e-07, "logits/chosen": -3.4802727699279785, "logits/rejected": -3.360588788986206, "logps/chosen": -15.909881591796875, "logps/rejected": -30.90283203125, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": 3.166745662689209, "rewards/margins": 3.166745662689209, "rewards/rejected": 0.0, "step": 1169 }, { "epoch": 6.5363128491620115, "grad_norm": 5.624151661105335, "learning_rate": 7.37225664532618e-07, "logits/chosen": -3.6656854152679443, "logits/rejected": -3.3708481788635254, "logps/chosen": -5.943014144897461, "logps/rejected": -62.0150260925293, "loss": 0.1463, "rewards/accuracies": 0.75, "rewards/chosen": 1.5379159450531006, "rewards/margins": 1.5379159450531006, "rewards/rejected": 0.0, "step": 1170 }, { "epoch": 6.5418994413407825, "grad_norm": 1.4296245046803395, "learning_rate": 7.366889174432049e-07, "logits/chosen": -3.0365090370178223, "logits/rejected": -3.0734381675720215, "logps/chosen": -15.989898681640625, "logps/rejected": -30.509754180908203, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 3.0527782440185547, "rewards/margins": 3.0527782440185547, "rewards/rejected": 0.0, "step": 1171 }, { "epoch": 6.547486033519553, "grad_norm": 3.3616999767523104, "learning_rate": 7.361518185919086e-07, "logits/chosen": -3.6535892486572266, "logits/rejected": -3.5110924243927, "logps/chosen": -5.396595001220703, "logps/rejected": -33.7409782409668, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 2.0897598266601562, "rewards/margins": 2.0897598266601562, "rewards/rejected": 0.0, "step": 1172 }, { "epoch": 6.553072625698324, "grad_norm": 2.334322615680733, "learning_rate": 7.356143687769534e-07, "logits/chosen": -3.3524158000946045, "logits/rejected": -3.2852206230163574, "logps/chosen": -18.785200119018555, "logps/rejected": -63.185218811035156, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 2.363424301147461, "rewards/margins": 2.363424301147461, "rewards/rejected": 0.0, "step": 1173 }, { "epoch": 6.558659217877095, "grad_norm": 6.407386995876666, "learning_rate": 7.350765687970856e-07, "logits/chosen": -3.524543523788452, "logits/rejected": -3.4610118865966797, "logps/chosen": -1.330913782119751, "logps/rejected": -51.27202606201172, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 2.0210142135620117, "rewards/margins": 2.0210142135620117, "rewards/rejected": 0.0, "step": 1174 }, { "epoch": 6.564245810055866, "grad_norm": 5.064980797784388, "learning_rate": 7.345384194515718e-07, "logits/chosen": -3.58046293258667, "logits/rejected": -3.786489248275757, "logps/chosen": -4.507716178894043, "logps/rejected": -48.74477005004883, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 2.252613067626953, "rewards/margins": 2.252613067626953, "rewards/rejected": 0.0, "step": 1175 }, { "epoch": 6.569832402234637, "grad_norm": 1.8492597615441708, "learning_rate": 7.339999215401975e-07, "logits/chosen": -3.372981548309326, "logits/rejected": -3.4837727546691895, "logps/chosen": -6.701407432556152, "logps/rejected": -37.21700668334961, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.7453620433807373, "rewards/margins": 2.7453620433807373, "rewards/rejected": 0.0, "step": 1176 }, { "epoch": 6.575418994413408, "grad_norm": 6.3422045402981855, "learning_rate": 7.334610758632668e-07, "logits/chosen": -3.6905386447906494, "logits/rejected": -3.767056703567505, "logps/chosen": -31.702373504638672, "logps/rejected": -46.73040771484375, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.9266233444213867, "rewards/margins": 2.9266233444213867, "rewards/rejected": 0.0, "step": 1177 }, { "epoch": 6.581005586592179, "grad_norm": 3.1790101358671246, "learning_rate": 7.329218832216003e-07, "logits/chosen": -3.395127773284912, "logits/rejected": -3.559117555618286, "logps/chosen": -5.663971900939941, "logps/rejected": -48.885009765625, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.305607318878174, "rewards/margins": 2.305607318878174, "rewards/rejected": 0.0, "step": 1178 }, { "epoch": 6.58659217877095, "grad_norm": 7.017385742847004, "learning_rate": 7.32382344416534e-07, "logits/chosen": -3.26444411277771, "logits/rejected": -3.290856122970581, "logps/chosen": -6.893622875213623, "logps/rejected": -60.45686721801758, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 2.7198753356933594, "rewards/margins": 2.7198753356933594, "rewards/rejected": 0.0, "step": 1179 }, { "epoch": 6.592178770949721, "grad_norm": 3.2928264012315513, "learning_rate": 7.318424602499192e-07, "logits/chosen": -3.4912145137786865, "logits/rejected": -3.486445426940918, "logps/chosen": -8.337770462036133, "logps/rejected": -35.21287155151367, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 2.795140504837036, "rewards/margins": 2.795140504837036, "rewards/rejected": 0.0, "step": 1180 }, { "epoch": 6.592178770949721, "eval_logits/chosen": -3.3981025218963623, "eval_logits/rejected": -3.5221188068389893, "eval_logps/chosen": -23.001008987426758, "eval_logps/rejected": -42.521881103515625, "eval_loss": 0.5904117822647095, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.706903338432312, "eval_rewards/margins": 0.706903338432312, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7111, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 1180 }, { "epoch": 6.597765363128492, "grad_norm": 2.2059531916875192, "learning_rate": 7.313022315241194e-07, "logits/chosen": -3.7348954677581787, "logits/rejected": -3.775949239730835, "logps/chosen": -16.370988845825195, "logps/rejected": -45.253597259521484, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": 2.4794254302978516, "rewards/margins": 2.4794254302978516, "rewards/rejected": 0.0, "step": 1181 }, { "epoch": 6.603351955307263, "grad_norm": 1.3834486669385506, "learning_rate": 7.307616590420109e-07, "logits/chosen": -3.329288959503174, "logits/rejected": -3.528491497039795, "logps/chosen": -5.240683555603027, "logps/rejected": -26.84988784790039, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 2.5287373065948486, "rewards/margins": 2.5287373065948486, "rewards/rejected": 0.0, "step": 1182 }, { "epoch": 6.608938547486034, "grad_norm": 4.938881978910298, "learning_rate": 7.302207436069806e-07, "logits/chosen": -3.7750566005706787, "logits/rejected": -3.910111427307129, "logps/chosen": -3.403721332550049, "logps/rejected": -88.3388442993164, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 1.8856760263442993, "rewards/margins": 1.8856760263442993, "rewards/rejected": 0.0, "step": 1183 }, { "epoch": 6.614525139664805, "grad_norm": 1.7965923799071406, "learning_rate": 7.296794860229253e-07, "logits/chosen": -3.5343782901763916, "logits/rejected": -3.388319969177246, "logps/chosen": -5.0793962478637695, "logps/rejected": -32.3995361328125, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.3262863159179688, "rewards/margins": 2.3262863159179688, "rewards/rejected": 0.0, "step": 1184 }, { "epoch": 6.620111731843576, "grad_norm": 1.5554842365439236, "learning_rate": 7.2913788709425e-07, "logits/chosen": -3.6333446502685547, "logits/rejected": -3.594719409942627, "logps/chosen": -5.939208030700684, "logps/rejected": -34.853302001953125, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": 2.663480281829834, "rewards/margins": 2.663480281829834, "rewards/rejected": 0.0, "step": 1185 }, { "epoch": 6.625698324022347, "grad_norm": 1.7944320372729725, "learning_rate": 7.285959476258673e-07, "logits/chosen": -3.796753406524658, "logits/rejected": -3.6380088329315186, "logps/chosen": -12.204524993896484, "logps/rejected": -24.707441329956055, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 3.130892276763916, "rewards/margins": 3.130892276763916, "rewards/rejected": 0.0, "step": 1186 }, { "epoch": 6.631284916201118, "grad_norm": 1.8467357194452632, "learning_rate": 7.280536684231957e-07, "logits/chosen": -3.501314163208008, "logits/rejected": -3.2968432903289795, "logps/chosen": -3.581840753555298, "logps/rejected": -25.83201789855957, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.264824628829956, "rewards/margins": 2.264824628829956, "rewards/rejected": 0.0, "step": 1187 }, { "epoch": 6.636871508379889, "grad_norm": 3.1295977153882824, "learning_rate": 7.275110502921588e-07, "logits/chosen": -3.4816713333129883, "logits/rejected": -3.6248412132263184, "logps/chosen": -1.5265557765960693, "logps/rejected": -31.821849822998047, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": 1.9344596862792969, "rewards/margins": 1.9344596862792969, "rewards/rejected": 0.0, "step": 1188 }, { "epoch": 6.64245810055866, "grad_norm": 2.7051695361680923, "learning_rate": 7.269680940391835e-07, "logits/chosen": -3.3442203998565674, "logits/rejected": -3.4781646728515625, "logps/chosen": -9.685230255126953, "logps/rejected": -36.17442321777344, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": 2.676804304122925, "rewards/margins": 2.676804304122925, "rewards/rejected": 0.0, "step": 1189 }, { "epoch": 6.648044692737431, "grad_norm": 6.929736209226811, "learning_rate": 7.264248004711997e-07, "logits/chosen": -3.505671501159668, "logits/rejected": -3.411191701889038, "logps/chosen": -8.421308517456055, "logps/rejected": -42.89100646972656, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 2.6684720516204834, "rewards/margins": 2.6684720516204834, "rewards/rejected": 0.0, "step": 1190 }, { "epoch": 6.653631284916202, "grad_norm": 3.004093237871956, "learning_rate": 7.258811703956385e-07, "logits/chosen": -3.009087324142456, "logits/rejected": -2.9700117111206055, "logps/chosen": -8.3806791305542, "logps/rejected": -40.365577697753906, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 2.768009901046753, "rewards/margins": 2.768009901046753, "rewards/rejected": 0.0, "step": 1191 }, { "epoch": 6.659217877094972, "grad_norm": 1.8564056250718206, "learning_rate": 7.253372046204307e-07, "logits/chosen": -3.6647820472717285, "logits/rejected": -3.746948003768921, "logps/chosen": -1.0470430850982666, "logps/rejected": -22.98351287841797, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 1.2635326385498047, "rewards/margins": 1.2635326385498047, "rewards/rejected": 0.0, "step": 1192 }, { "epoch": 6.664804469273743, "grad_norm": 2.596685756261615, "learning_rate": 7.247929039540064e-07, "logits/chosen": -3.5840508937835693, "logits/rejected": -3.5116047859191895, "logps/chosen": -10.925397872924805, "logps/rejected": -37.749664306640625, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 2.496830940246582, "rewards/margins": 2.496830940246582, "rewards/rejected": 0.0, "step": 1193 }, { "epoch": 6.670391061452514, "grad_norm": 1.7800121822891553, "learning_rate": 7.242482692052935e-07, "logits/chosen": -3.11643385887146, "logits/rejected": -3.096829652786255, "logps/chosen": -9.554300308227539, "logps/rejected": -43.78571319580078, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 2.657076597213745, "rewards/margins": 2.657076597213745, "rewards/rejected": 0.0, "step": 1194 }, { "epoch": 6.675977653631285, "grad_norm": 1.8899008327500304, "learning_rate": 7.237033011837161e-07, "logits/chosen": -3.393386125564575, "logits/rejected": -3.544006586074829, "logps/chosen": -5.600795745849609, "logps/rejected": -52.57780456542969, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 2.2170066833496094, "rewards/margins": 2.2170066833496094, "rewards/rejected": 0.0, "step": 1195 }, { "epoch": 6.681564245810056, "grad_norm": 4.892910432832869, "learning_rate": 7.23158000699194e-07, "logits/chosen": -3.600586175918579, "logits/rejected": -3.6606061458587646, "logps/chosen": -28.68573760986328, "logps/rejected": -27.99110221862793, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 3.1784684658050537, "rewards/margins": 3.1784684658050537, "rewards/rejected": 0.0, "step": 1196 }, { "epoch": 6.687150837988827, "grad_norm": 4.778481457666094, "learning_rate": 7.226123685621403e-07, "logits/chosen": -3.7846198081970215, "logits/rejected": -3.756338119506836, "logps/chosen": -1.9063308238983154, "logps/rejected": -26.37069320678711, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 1.7744323015213013, "rewards/margins": 1.7744323015213013, "rewards/rejected": 0.0, "step": 1197 }, { "epoch": 6.692737430167598, "grad_norm": 6.191543108954036, "learning_rate": 7.22066405583462e-07, "logits/chosen": -3.42878794670105, "logits/rejected": -3.6990392208099365, "logps/chosen": -14.416027069091797, "logps/rejected": -42.47001266479492, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 2.790872812271118, "rewards/margins": 2.790872812271118, "rewards/rejected": 0.0, "step": 1198 }, { "epoch": 6.698324022346369, "grad_norm": 3.200562659039872, "learning_rate": 7.21520112574557e-07, "logits/chosen": -3.419428586959839, "logits/rejected": -3.5295228958129883, "logps/chosen": -1.1163108348846436, "logps/rejected": -48.84260177612305, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 1.7757481336593628, "rewards/margins": 1.7757481336593628, "rewards/rejected": 0.0, "step": 1199 }, { "epoch": 6.70391061452514, "grad_norm": 3.4983642115084916, "learning_rate": 7.209734903473141e-07, "logits/chosen": -3.2450597286224365, "logits/rejected": -3.299987554550171, "logps/chosen": -2.303595781326294, "logps/rejected": -40.97272872924805, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 2.18654203414917, "rewards/margins": 2.18654203414917, "rewards/rejected": 0.0, "step": 1200 }, { "epoch": 6.70391061452514, "eval_logits/chosen": -3.398895740509033, "eval_logits/rejected": -3.5260047912597656, "eval_logps/chosen": -23.452024459838867, "eval_logps/rejected": -43.03379821777344, "eval_loss": 0.617557942867279, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 0.6618017554283142, "eval_rewards/margins": 0.6618017554283142, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7097, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 1200 }, { "epoch": 6.709497206703911, "grad_norm": 2.1236245304175703, "learning_rate": 7.204265397141115e-07, "logits/chosen": -3.3595433235168457, "logits/rejected": -3.3600683212280273, "logps/chosen": -3.071162223815918, "logps/rejected": -47.003204345703125, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 1.686194896697998, "rewards/margins": 1.686194896697998, "rewards/rejected": 0.0, "step": 1201 }, { "epoch": 6.715083798882682, "grad_norm": 3.0521846397375367, "learning_rate": 7.19879261487815e-07, "logits/chosen": -3.675001859664917, "logits/rejected": -3.700336456298828, "logps/chosen": -5.996543884277344, "logps/rejected": -38.44892883300781, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 2.87506103515625, "rewards/margins": 2.87506103515625, "rewards/rejected": 0.0, "step": 1202 }, { "epoch": 6.720670391061453, "grad_norm": 2.3043975156469174, "learning_rate": 7.193316564817775e-07, "logits/chosen": -3.619896173477173, "logits/rejected": -3.757736921310425, "logps/chosen": -4.08686637878418, "logps/rejected": -42.31098937988281, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 2.071007013320923, "rewards/margins": 2.071007013320923, "rewards/rejected": 0.0, "step": 1203 }, { "epoch": 6.726256983240224, "grad_norm": 9.080938342219454, "learning_rate": 7.187837255098378e-07, "logits/chosen": -3.453091859817505, "logits/rejected": -3.3350491523742676, "logps/chosen": -11.954121589660645, "logps/rejected": -69.71781921386719, "loss": 0.1533, "rewards/accuracies": 0.75, "rewards/chosen": 1.75028395652771, "rewards/margins": 1.75028395652771, "rewards/rejected": 0.0, "step": 1204 }, { "epoch": 6.731843575418995, "grad_norm": 6.330344122312587, "learning_rate": 7.182354693863185e-07, "logits/chosen": -3.3193860054016113, "logits/rejected": -3.7034122943878174, "logps/chosen": -1.3383597135543823, "logps/rejected": -49.510345458984375, "loss": 0.159, "rewards/accuracies": 1.0, "rewards/chosen": 1.6445035934448242, "rewards/margins": 1.6445035934448242, "rewards/rejected": 0.0, "step": 1205 }, { "epoch": 6.737430167597766, "grad_norm": 4.194445576780564, "learning_rate": 7.176868889260264e-07, "logits/chosen": -3.772306442260742, "logits/rejected": -3.756890058517456, "logps/chosen": -2.6748602390289307, "logps/rejected": -42.153343200683594, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 2.1727726459503174, "rewards/margins": 2.1727726459503174, "rewards/rejected": 0.0, "step": 1206 }, { "epoch": 6.743016759776537, "grad_norm": 2.1329505604911145, "learning_rate": 7.171379849442492e-07, "logits/chosen": -3.4764533042907715, "logits/rejected": -3.428299903869629, "logps/chosen": -26.964725494384766, "logps/rejected": -50.2739143371582, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 2.7658278942108154, "rewards/margins": 2.7658278942108154, "rewards/rejected": 0.0, "step": 1207 }, { "epoch": 6.748603351955307, "grad_norm": 2.042392428716392, "learning_rate": 7.165887582567562e-07, "logits/chosen": -3.3654351234436035, "logits/rejected": -3.2815451622009277, "logps/chosen": -10.788187026977539, "logps/rejected": -42.93102264404297, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.399768829345703, "rewards/margins": 2.399768829345703, "rewards/rejected": 0.0, "step": 1208 }, { "epoch": 6.754189944134078, "grad_norm": 1.8204590297465313, "learning_rate": 7.160392096797962e-07, "logits/chosen": -3.4422526359558105, "logits/rejected": -3.43233323097229, "logps/chosen": -6.396111011505127, "logps/rejected": -38.756385803222656, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 2.2732014656066895, "rewards/margins": 2.2732014656066895, "rewards/rejected": 0.0, "step": 1209 }, { "epoch": 6.759776536312849, "grad_norm": 1.9915316249999613, "learning_rate": 7.154893400300961e-07, "logits/chosen": -3.3545596599578857, "logits/rejected": -3.3272500038146973, "logps/chosen": -15.552148818969727, "logps/rejected": -38.22154235839844, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/chosen": 3.815664052963257, "rewards/margins": 3.815664052963257, "rewards/rejected": 0.0, "step": 1210 }, { "epoch": 6.76536312849162, "grad_norm": 2.1818868474595057, "learning_rate": 7.1493915012486e-07, "logits/chosen": -3.586728572845459, "logits/rejected": -3.6481289863586426, "logps/chosen": -5.913375377655029, "logps/rejected": -59.924766540527344, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 2.5586137771606445, "rewards/margins": 2.5586137771606445, "rewards/rejected": 0.0, "step": 1211 }, { "epoch": 6.770949720670391, "grad_norm": 6.423805073766037, "learning_rate": 7.143886407817685e-07, "logits/chosen": -3.373922348022461, "logits/rejected": -3.477799415588379, "logps/chosen": -2.9880781173706055, "logps/rejected": -27.999588012695312, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": 1.9803999662399292, "rewards/margins": 1.9803999662399292, "rewards/rejected": 0.0, "step": 1212 }, { "epoch": 6.776536312849162, "grad_norm": 6.5904341124871975, "learning_rate": 7.138378128189762e-07, "logits/chosen": -3.2786753177642822, "logits/rejected": -3.3929131031036377, "logps/chosen": -0.9103558659553528, "logps/rejected": -32.21274185180664, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 1.2840886116027832, "rewards/margins": 1.2840886116027832, "rewards/rejected": 0.0, "step": 1213 }, { "epoch": 6.782122905027933, "grad_norm": 2.7972847424818963, "learning_rate": 7.132866670551116e-07, "logits/chosen": -3.7327377796173096, "logits/rejected": -3.74499249458313, "logps/chosen": -13.38409423828125, "logps/rejected": -38.56711196899414, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 3.2696335315704346, "rewards/margins": 3.2696335315704346, "rewards/rejected": 0.0, "step": 1214 }, { "epoch": 6.787709497206704, "grad_norm": 1.3959306891227423, "learning_rate": 7.127352043092754e-07, "logits/chosen": -3.496448278427124, "logits/rejected": -3.335744619369507, "logps/chosen": -6.675405502319336, "logps/rejected": -44.683937072753906, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 2.6381330490112305, "rewards/margins": 2.6381330490112305, "rewards/rejected": 0.0, "step": 1215 }, { "epoch": 6.793296089385475, "grad_norm": 2.259854727960776, "learning_rate": 7.121834254010397e-07, "logits/chosen": -3.683525323867798, "logits/rejected": -3.503322124481201, "logps/chosen": -2.7960424423217773, "logps/rejected": -32.17485046386719, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 2.0533640384674072, "rewards/margins": 2.0533640384674072, "rewards/rejected": 0.0, "step": 1216 }, { "epoch": 6.798882681564246, "grad_norm": 1.7479237032343986, "learning_rate": 7.116313311504459e-07, "logits/chosen": -3.8045952320098877, "logits/rejected": -3.823420524597168, "logps/chosen": -8.632901191711426, "logps/rejected": -41.81629180908203, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 2.8498501777648926, "rewards/margins": 2.8498501777648926, "rewards/rejected": 0.0, "step": 1217 }, { "epoch": 6.804469273743017, "grad_norm": 3.3165446290546354, "learning_rate": 7.110789223780045e-07, "logits/chosen": -3.540322780609131, "logits/rejected": -3.6948049068450928, "logps/chosen": -7.688474178314209, "logps/rejected": -58.047393798828125, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 2.5741453170776367, "rewards/margins": 2.5741453170776367, "rewards/rejected": 0.0, "step": 1218 }, { "epoch": 6.810055865921788, "grad_norm": 4.9105938149798485, "learning_rate": 7.105261999046934e-07, "logits/chosen": -3.63484787940979, "logits/rejected": -3.6123716831207275, "logps/chosen": -6.4083356857299805, "logps/rejected": -45.80668640136719, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 2.0915567874908447, "rewards/margins": 2.0915567874908447, "rewards/rejected": 0.0, "step": 1219 }, { "epoch": 6.815642458100559, "grad_norm": 2.493230433513645, "learning_rate": 7.099731645519567e-07, "logits/chosen": -3.586568832397461, "logits/rejected": -3.7155306339263916, "logps/chosen": -16.714839935302734, "logps/rejected": -30.529937744140625, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 2.616159439086914, "rewards/margins": 2.616159439086914, "rewards/rejected": 0.0, "step": 1220 }, { "epoch": 6.815642458100559, "eval_logits/chosen": -3.400486707687378, "eval_logits/rejected": -3.5274319648742676, "eval_logps/chosen": -23.52200698852539, "eval_logps/rejected": -42.7762451171875, "eval_loss": 0.6176553964614868, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 0.6548033356666565, "eval_rewards/margins": 0.6548033356666565, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7579, "eval_samples_per_second": 9.463, "eval_steps_per_second": 0.305, "step": 1220 }, { "epoch": 6.82122905027933, "grad_norm": 1.8848191142706658, "learning_rate": 7.094198171417031e-07, "logits/chosen": -3.562079429626465, "logits/rejected": -3.5121777057647705, "logps/chosen": -1.9159338474273682, "logps/rejected": -44.53816223144531, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 1.6635804176330566, "rewards/margins": 1.6635804176330566, "rewards/rejected": 0.0, "step": 1221 }, { "epoch": 6.826815642458101, "grad_norm": 9.830095342409699, "learning_rate": 7.088661584963058e-07, "logits/chosen": -3.6274678707122803, "logits/rejected": -3.4499831199645996, "logps/chosen": -10.85792064666748, "logps/rejected": -25.182781219482422, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 2.5888285636901855, "rewards/margins": 2.5888285636901855, "rewards/rejected": 0.0, "step": 1222 }, { "epoch": 6.832402234636872, "grad_norm": 3.349762567016223, "learning_rate": 7.083121894385999e-07, "logits/chosen": -3.280095338821411, "logits/rejected": -3.4245829582214355, "logps/chosen": -11.596260070800781, "logps/rejected": -51.15528869628906, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.3517813682556152, "rewards/margins": 2.3517813682556152, "rewards/rejected": 0.0, "step": 1223 }, { "epoch": 6.837988826815643, "grad_norm": 2.407717174388427, "learning_rate": 7.077579107918821e-07, "logits/chosen": -3.8237805366516113, "logits/rejected": -3.719397783279419, "logps/chosen": -5.843072414398193, "logps/rejected": -27.59650421142578, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 2.6652581691741943, "rewards/margins": 2.6652581691741943, "rewards/rejected": 0.0, "step": 1224 }, { "epoch": 6.843575418994414, "grad_norm": 2.360322438871608, "learning_rate": 7.072033233799091e-07, "logits/chosen": -3.5416440963745117, "logits/rejected": -3.6952686309814453, "logps/chosen": -2.770350694656372, "logps/rejected": -73.64277648925781, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 2.074777603149414, "rewards/margins": 2.074777603149414, "rewards/rejected": 0.0, "step": 1225 }, { "epoch": 6.849162011173185, "grad_norm": 3.7667916496111733, "learning_rate": 7.066484280268967e-07, "logits/chosen": -3.4020073413848877, "logits/rejected": -3.483330726623535, "logps/chosen": -6.428084373474121, "logps/rejected": -30.403133392333984, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": 3.0210697650909424, "rewards/margins": 3.0210697650909424, "rewards/rejected": 0.0, "step": 1226 }, { "epoch": 6.854748603351956, "grad_norm": 4.398224507010163, "learning_rate": 7.060932255575182e-07, "logits/chosen": -3.457984447479248, "logits/rejected": -3.4139060974121094, "logps/chosen": -2.470066785812378, "logps/rejected": -100.52609252929688, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 2.1537039279937744, "rewards/margins": 2.1537039279937744, "rewards/rejected": 0.0, "step": 1227 }, { "epoch": 6.860335195530726, "grad_norm": 2.487376116821704, "learning_rate": 7.055377167969034e-07, "logits/chosen": -3.5053632259368896, "logits/rejected": -3.6033573150634766, "logps/chosen": -2.783202648162842, "logps/rejected": -30.8094425201416, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 2.058837413787842, "rewards/margins": 2.058837413787842, "rewards/rejected": 0.0, "step": 1228 }, { "epoch": 6.865921787709497, "grad_norm": 2.0118986319633723, "learning_rate": 7.049819025706371e-07, "logits/chosen": -3.568035840988159, "logits/rejected": -3.5127785205841064, "logps/chosen": -2.541534900665283, "logps/rejected": -27.387203216552734, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.272038459777832, "rewards/margins": 2.272038459777832, "rewards/rejected": 0.0, "step": 1229 }, { "epoch": 6.871508379888268, "grad_norm": 3.518051662240881, "learning_rate": 7.044257837047583e-07, "logits/chosen": -3.6467161178588867, "logits/rejected": -3.5425987243652344, "logps/chosen": -6.234301567077637, "logps/rejected": -38.334869384765625, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 2.4043684005737305, "rewards/margins": 2.4043684005737305, "rewards/rejected": 0.0, "step": 1230 }, { "epoch": 6.877094972067039, "grad_norm": 4.108878555529441, "learning_rate": 7.038693610257588e-07, "logits/chosen": -3.5054585933685303, "logits/rejected": -3.3625519275665283, "logps/chosen": -9.478939056396484, "logps/rejected": -25.657081604003906, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": 2.4863359928131104, "rewards/margins": 2.4863359928131104, "rewards/rejected": 0.0, "step": 1231 }, { "epoch": 6.88268156424581, "grad_norm": 8.374221840904603, "learning_rate": 7.033126353605816e-07, "logits/chosen": -3.3767518997192383, "logits/rejected": -3.1736011505126953, "logps/chosen": -9.914738655090332, "logps/rejected": -28.2974853515625, "loss": 0.1791, "rewards/accuracies": 1.0, "rewards/chosen": 2.1386775970458984, "rewards/margins": 2.1386775970458984, "rewards/rejected": 0.0, "step": 1232 }, { "epoch": 6.888268156424581, "grad_norm": 1.7720574028531357, "learning_rate": 7.027556075366202e-07, "logits/chosen": -3.438734531402588, "logits/rejected": -3.4485371112823486, "logps/chosen": -3.397510051727295, "logps/rejected": -41.679359436035156, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 2.425100803375244, "rewards/margins": 2.425100803375244, "rewards/rejected": 0.0, "step": 1233 }, { "epoch": 6.893854748603352, "grad_norm": 4.468675600832932, "learning_rate": 7.021982783817172e-07, "logits/chosen": -3.4875001907348633, "logits/rejected": -3.5468428134918213, "logps/chosen": -9.436210632324219, "logps/rejected": -37.21449279785156, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 2.802164077758789, "rewards/margins": 2.802164077758789, "rewards/rejected": 0.0, "step": 1234 }, { "epoch": 6.899441340782123, "grad_norm": 4.452848779091001, "learning_rate": 7.01640648724163e-07, "logits/chosen": -3.4259796142578125, "logits/rejected": -3.4563775062561035, "logps/chosen": -5.967182159423828, "logps/rejected": -51.27204513549805, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 2.487522602081299, "rewards/margins": 2.487522602081299, "rewards/rejected": 0.0, "step": 1235 }, { "epoch": 6.905027932960894, "grad_norm": 3.8344758884608425, "learning_rate": 7.010827193926946e-07, "logits/chosen": -3.5665276050567627, "logits/rejected": -3.6301960945129395, "logps/chosen": -0.8891786336898804, "logps/rejected": -61.749176025390625, "loss": 0.1233, "rewards/accuracies": 1.0, "rewards/chosen": 1.3752551078796387, "rewards/margins": 1.3752551078796387, "rewards/rejected": 0.0, "step": 1236 }, { "epoch": 6.910614525139665, "grad_norm": 1.4105069292029504, "learning_rate": 7.005244912164941e-07, "logits/chosen": -3.363919734954834, "logits/rejected": -3.5228559970855713, "logps/chosen": -6.403987884521484, "logps/rejected": -29.543251037597656, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 2.6485252380371094, "rewards/margins": 2.6485252380371094, "rewards/rejected": 0.0, "step": 1237 }, { "epoch": 6.916201117318436, "grad_norm": 1.567324172711957, "learning_rate": 6.999659650251885e-07, "logits/chosen": -3.5861716270446777, "logits/rejected": -3.5285205841064453, "logps/chosen": -19.173553466796875, "logps/rejected": -30.03134536743164, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 3.41219162940979, "rewards/margins": 3.41219162940979, "rewards/rejected": 0.0, "step": 1238 }, { "epoch": 6.921787709497207, "grad_norm": 2.9385896411756254, "learning_rate": 6.994071416488467e-07, "logits/chosen": -3.4926271438598633, "logits/rejected": -3.8362624645233154, "logps/chosen": -2.885002851486206, "logps/rejected": -75.10549926757812, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 2.034637212753296, "rewards/margins": 2.034637212753296, "rewards/rejected": 0.0, "step": 1239 }, { "epoch": 6.927374301675978, "grad_norm": 1.8495862875255806, "learning_rate": 6.988480219179801e-07, "logits/chosen": -3.431464195251465, "logits/rejected": -3.0147957801818848, "logps/chosen": -1.5547078847885132, "logps/rejected": -49.615867614746094, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": 1.7282437086105347, "rewards/margins": 1.7282437086105347, "rewards/rejected": 0.0, "step": 1240 }, { "epoch": 6.927374301675978, "eval_logits/chosen": -3.40470814704895, "eval_logits/rejected": -3.530635118484497, "eval_logps/chosen": -23.34773826599121, "eval_logps/rejected": -43.31367874145508, "eval_loss": 0.6201418042182922, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.6722301840782166, "eval_rewards/margins": 0.6722301840782166, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7094, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 1240 }, { "epoch": 6.932960893854749, "grad_norm": 3.022799239575308, "learning_rate": 6.982886066635398e-07, "logits/chosen": -3.509122371673584, "logits/rejected": -3.611570358276367, "logps/chosen": -5.77220344543457, "logps/rejected": -30.937862396240234, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 2.851273536682129, "rewards/margins": 2.851273536682129, "rewards/rejected": 0.0, "step": 1241 }, { "epoch": 6.93854748603352, "grad_norm": 3.6542646780509718, "learning_rate": 6.97728896716917e-07, "logits/chosen": -3.1856911182403564, "logits/rejected": -3.246323823928833, "logps/chosen": -5.146172523498535, "logps/rejected": -26.055557250976562, "loss": 0.1565, "rewards/accuracies": 1.0, "rewards/chosen": 2.2226128578186035, "rewards/margins": 2.2226128578186035, "rewards/rejected": 0.0, "step": 1242 }, { "epoch": 6.94413407821229, "grad_norm": 2.724818942837176, "learning_rate": 6.9716889290994e-07, "logits/chosen": -3.484645128250122, "logits/rejected": -3.6872596740722656, "logps/chosen": -15.662714004516602, "logps/rejected": -20.226940155029297, "loss": 0.0977, "rewards/accuracies": 1.0, "rewards/chosen": 2.2329084873199463, "rewards/margins": 2.2329084873199463, "rewards/rejected": 0.0, "step": 1243 }, { "epoch": 6.949720670391061, "grad_norm": 3.233174886627538, "learning_rate": 6.966085960748741e-07, "logits/chosen": -3.3511745929718018, "logits/rejected": -3.443159818649292, "logps/chosen": -15.10945987701416, "logps/rejected": -27.131855010986328, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": 3.1416854858398438, "rewards/margins": 3.1416854858398438, "rewards/rejected": 0.0, "step": 1244 }, { "epoch": 6.955307262569832, "grad_norm": 1.7873288884349265, "learning_rate": 6.960480070444205e-07, "logits/chosen": -3.6787734031677246, "logits/rejected": -3.816436767578125, "logps/chosen": -8.850296020507812, "logps/rejected": -18.008380889892578, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 3.0034384727478027, "rewards/margins": 3.0034384727478027, "rewards/rejected": 0.0, "step": 1245 }, { "epoch": 6.960893854748603, "grad_norm": 3.933611288564565, "learning_rate": 6.954871266517142e-07, "logits/chosen": -3.684723138809204, "logits/rejected": -3.5727052688598633, "logps/chosen": -9.609380722045898, "logps/rejected": -21.106409072875977, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 2.8668670654296875, "rewards/margins": 2.8668670654296875, "rewards/rejected": 0.0, "step": 1246 }, { "epoch": 6.966480446927374, "grad_norm": 3.188199242094565, "learning_rate": 6.949259557303232e-07, "logits/chosen": -3.592222213745117, "logits/rejected": -3.5707173347473145, "logps/chosen": -3.5327694416046143, "logps/rejected": -23.995849609375, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 2.4314236640930176, "rewards/margins": 2.4314236640930176, "rewards/rejected": 0.0, "step": 1247 }, { "epoch": 6.972067039106145, "grad_norm": 6.666790429932437, "learning_rate": 6.943644951142477e-07, "logits/chosen": -3.750788688659668, "logits/rejected": -3.7600064277648926, "logps/chosen": -25.545528411865234, "logps/rejected": -38.38447570800781, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": 3.568408250808716, "rewards/margins": 3.568408250808716, "rewards/rejected": 0.0, "step": 1248 }, { "epoch": 6.977653631284916, "grad_norm": 1.9302870448901726, "learning_rate": 6.938027456379181e-07, "logits/chosen": -3.6586012840270996, "logits/rejected": -3.828281879425049, "logps/chosen": -2.125006914138794, "logps/rejected": -25.19756317138672, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.2528858184814453, "rewards/margins": 2.2528858184814453, "rewards/rejected": 0.0, "step": 1249 }, { "epoch": 6.983240223463687, "grad_norm": 4.1458219129676985, "learning_rate": 6.93240708136194e-07, "logits/chosen": -3.2185046672821045, "logits/rejected": -3.186969518661499, "logps/chosen": -46.381134033203125, "logps/rejected": -36.031314849853516, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.10642409324646, "rewards/margins": 2.10642409324646, "rewards/rejected": 0.0, "step": 1250 }, { "epoch": 6.988826815642458, "grad_norm": 2.0311511446524064, "learning_rate": 6.926783834443633e-07, "logits/chosen": -3.650008201599121, "logits/rejected": -3.5961015224456787, "logps/chosen": -2.199688196182251, "logps/rejected": -33.29227828979492, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 1.8048226833343506, "rewards/margins": 1.8048226833343506, "rewards/rejected": 0.0, "step": 1251 }, { "epoch": 6.994413407821229, "grad_norm": 5.875325680193311, "learning_rate": 6.921157723981408e-07, "logits/chosen": -3.4090051651000977, "logits/rejected": -3.5231189727783203, "logps/chosen": -37.33778762817383, "logps/rejected": -23.965280532836914, "loss": 0.127, "rewards/accuracies": 1.0, "rewards/chosen": 2.1387805938720703, "rewards/margins": 2.1387805938720703, "rewards/rejected": 0.0, "step": 1252 }, { "epoch": 7.0, "grad_norm": 4.5952188096971085, "learning_rate": 6.915528758336664e-07, "logits/chosen": -3.5402402877807617, "logits/rejected": -3.58508563041687, "logps/chosen": -10.173696517944336, "logps/rejected": -33.168235778808594, "loss": 0.1321, "rewards/accuracies": 1.0, "rewards/chosen": 2.598673105239868, "rewards/margins": 2.598673105239868, "rewards/rejected": 0.0, "step": 1253 }, { "epoch": 7.005586592178771, "grad_norm": 1.9089672348865112, "learning_rate": 6.909896945875049e-07, "logits/chosen": -3.530308723449707, "logits/rejected": -3.4146568775177, "logps/chosen": -7.088906288146973, "logps/rejected": -26.716922760009766, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 3.2254104614257812, "rewards/margins": 3.2254104614257812, "rewards/rejected": 0.0, "step": 1254 }, { "epoch": 7.011173184357542, "grad_norm": 1.9922181482129386, "learning_rate": 6.904262294966438e-07, "logits/chosen": -3.759143114089966, "logits/rejected": -3.964033842086792, "logps/chosen": -8.769649505615234, "logps/rejected": -39.40819549560547, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 3.294891357421875, "rewards/margins": 3.294891357421875, "rewards/rejected": 0.0, "step": 1255 }, { "epoch": 7.016759776536313, "grad_norm": 2.8377968284119115, "learning_rate": 6.898624813984927e-07, "logits/chosen": -3.4786267280578613, "logits/rejected": -3.4019179344177246, "logps/chosen": -3.114628791809082, "logps/rejected": -68.92147827148438, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.311002254486084, "rewards/margins": 2.311002254486084, "rewards/rejected": 0.0, "step": 1256 }, { "epoch": 7.022346368715084, "grad_norm": 1.6878198951718528, "learning_rate": 6.892984511308813e-07, "logits/chosen": -3.630856990814209, "logits/rejected": -3.5413107872009277, "logps/chosen": -4.833260536193848, "logps/rejected": -46.88436508178711, "loss": 0.1113, "rewards/accuracies": 1.0, "rewards/chosen": 2.4715824127197266, "rewards/margins": 2.4715824127197266, "rewards/rejected": 0.0, "step": 1257 }, { "epoch": 7.027932960893855, "grad_norm": 1.3587412021660126, "learning_rate": 6.887341395320596e-07, "logits/chosen": -3.56107234954834, "logits/rejected": -3.789989948272705, "logps/chosen": -1.250314712524414, "logps/rejected": -30.90006446838379, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 1.7148796319961548, "rewards/margins": 1.7148796319961548, "rewards/rejected": 0.0, "step": 1258 }, { "epoch": 7.033519553072626, "grad_norm": 1.6857703997299212, "learning_rate": 6.881695474406947e-07, "logits/chosen": -3.827329635620117, "logits/rejected": -3.8502554893493652, "logps/chosen": -2.4466514587402344, "logps/rejected": -41.08592987060547, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 1.9332318305969238, "rewards/margins": 1.9332318305969238, "rewards/rejected": 0.0, "step": 1259 }, { "epoch": 7.039106145251397, "grad_norm": 2.4773497393362045, "learning_rate": 6.876046756958713e-07, "logits/chosen": -3.564875841140747, "logits/rejected": -3.6816511154174805, "logps/chosen": -13.001611709594727, "logps/rejected": -52.05155944824219, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8277783393859863, "rewards/margins": 2.8277783393859863, "rewards/rejected": 0.0, "step": 1260 }, { "epoch": 7.039106145251397, "eval_logits/chosen": -3.3980274200439453, "eval_logits/rejected": -3.524296283721924, "eval_logps/chosen": -23.279247283935547, "eval_logps/rejected": -42.626617431640625, "eval_loss": 0.6063249111175537, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 0.6790796518325806, "eval_rewards/margins": 0.6790796518325806, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7375, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 1260 }, { "epoch": 7.044692737430168, "grad_norm": 3.0503634919952907, "learning_rate": 6.870395251370893e-07, "logits/chosen": -3.6887543201446533, "logits/rejected": -3.7471749782562256, "logps/chosen": -13.48740005493164, "logps/rejected": -36.2447395324707, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 3.6292214393615723, "rewards/margins": 3.6292214393615723, "rewards/rejected": 0.0, "step": 1261 }, { "epoch": 7.050279329608939, "grad_norm": 2.355830206300344, "learning_rate": 6.864740966042628e-07, "logits/chosen": -3.3249711990356445, "logits/rejected": -3.4159483909606934, "logps/chosen": -2.5284647941589355, "logps/rejected": -30.496246337890625, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 2.2643685340881348, "rewards/margins": 2.2643685340881348, "rewards/rejected": 0.0, "step": 1262 }, { "epoch": 7.055865921787709, "grad_norm": 2.7909867720166153, "learning_rate": 6.859083909377196e-07, "logits/chosen": -3.4230048656463623, "logits/rejected": -3.4332618713378906, "logps/chosen": -12.817394256591797, "logps/rejected": -29.420516967773438, "loss": 0.1351, "rewards/accuracies": 1.0, "rewards/chosen": 2.819218873977661, "rewards/margins": 2.819218873977661, "rewards/rejected": 0.0, "step": 1263 }, { "epoch": 7.06145251396648, "grad_norm": 1.1915542948733544, "learning_rate": 6.85342408978199e-07, "logits/chosen": -3.303455352783203, "logits/rejected": -3.5971484184265137, "logps/chosen": -7.716994762420654, "logps/rejected": -54.277198791503906, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 2.4927916526794434, "rewards/margins": 2.4927916526794434, "rewards/rejected": 0.0, "step": 1264 }, { "epoch": 7.067039106145251, "grad_norm": 4.254128587452519, "learning_rate": 6.847761515668511e-07, "logits/chosen": -3.419790506362915, "logits/rejected": -3.2606422901153564, "logps/chosen": -16.361888885498047, "logps/rejected": -57.17011642456055, "loss": 0.1224, "rewards/accuracies": 1.0, "rewards/chosen": 2.781437397003174, "rewards/margins": 2.781437397003174, "rewards/rejected": 0.0, "step": 1265 }, { "epoch": 7.072625698324022, "grad_norm": 1.572679292363798, "learning_rate": 6.842096195452349e-07, "logits/chosen": -3.235483169555664, "logits/rejected": -3.463332414627075, "logps/chosen": -2.7052762508392334, "logps/rejected": -94.96967315673828, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 1.9657814502716064, "rewards/margins": 1.9657814502716064, "rewards/rejected": 0.0, "step": 1266 }, { "epoch": 7.078212290502793, "grad_norm": 3.991580921121, "learning_rate": 6.836428137553183e-07, "logits/chosen": -3.574601411819458, "logits/rejected": -3.6130170822143555, "logps/chosen": -7.246374607086182, "logps/rejected": -52.97753143310547, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 3.0662167072296143, "rewards/margins": 3.0662167072296143, "rewards/rejected": 0.0, "step": 1267 }, { "epoch": 7.083798882681564, "grad_norm": 1.5920670141128423, "learning_rate": 6.830757350394754e-07, "logits/chosen": -3.4136204719543457, "logits/rejected": -3.473283529281616, "logps/chosen": -1.0013225078582764, "logps/rejected": -64.36286926269531, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 1.665197491645813, "rewards/margins": 1.665197491645813, "rewards/rejected": 0.0, "step": 1268 }, { "epoch": 7.089385474860335, "grad_norm": 1.9658474063902145, "learning_rate": 6.825083842404862e-07, "logits/chosen": -3.531083583831787, "logits/rejected": -3.6414847373962402, "logps/chosen": -1.0360397100448608, "logps/rejected": -97.47354888916016, "loss": 0.1368, "rewards/accuracies": 1.0, "rewards/chosen": 1.8494411706924438, "rewards/margins": 1.8494411706924438, "rewards/rejected": 0.0, "step": 1269 }, { "epoch": 7.094972067039106, "grad_norm": 1.8695660026719596, "learning_rate": 6.819407622015352e-07, "logits/chosen": -3.7749783992767334, "logits/rejected": -3.5601696968078613, "logps/chosen": -13.966944694519043, "logps/rejected": -26.170475006103516, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 3.500847578048706, "rewards/margins": 3.500847578048706, "rewards/rejected": 0.0, "step": 1270 }, { "epoch": 7.100558659217877, "grad_norm": 2.098493826176811, "learning_rate": 6.813728697662096e-07, "logits/chosen": -3.7067666053771973, "logits/rejected": -3.4574549198150635, "logps/chosen": -6.075762748718262, "logps/rejected": -60.33864974975586, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 2.639904737472534, "rewards/margins": 2.639904737472534, "rewards/rejected": 0.0, "step": 1271 }, { "epoch": 7.106145251396648, "grad_norm": 1.1897135710493125, "learning_rate": 6.808047077784987e-07, "logits/chosen": -3.3910703659057617, "logits/rejected": -3.6197991371154785, "logps/chosen": -8.341439247131348, "logps/rejected": -38.314170837402344, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 2.6378109455108643, "rewards/margins": 2.6378109455108643, "rewards/rejected": 0.0, "step": 1272 }, { "epoch": 7.111731843575419, "grad_norm": 1.4566550515248526, "learning_rate": 6.802362770827926e-07, "logits/chosen": -3.423917770385742, "logits/rejected": -3.4773166179656982, "logps/chosen": -9.888988494873047, "logps/rejected": -38.88087844848633, "loss": 0.1033, "rewards/accuracies": 1.0, "rewards/chosen": 3.1006269454956055, "rewards/margins": 3.1006269454956055, "rewards/rejected": 0.0, "step": 1273 }, { "epoch": 7.11731843575419, "grad_norm": 1.5518900819244872, "learning_rate": 6.796675785238803e-07, "logits/chosen": -3.8480265140533447, "logits/rejected": -3.7801334857940674, "logps/chosen": -7.182609558105469, "logps/rejected": -22.92597198486328, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 3.425747871398926, "rewards/margins": 3.425747871398926, "rewards/rejected": 0.0, "step": 1274 }, { "epoch": 7.122905027932961, "grad_norm": 1.0502692279830264, "learning_rate": 6.790986129469494e-07, "logits/chosen": -3.9019930362701416, "logits/rejected": -3.716822624206543, "logps/chosen": -3.439368963241577, "logps/rejected": -28.22347640991211, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 2.6313304901123047, "rewards/margins": 2.6313304901123047, "rewards/rejected": 0.0, "step": 1275 }, { "epoch": 7.128491620111732, "grad_norm": 1.6363874006250305, "learning_rate": 6.785293811975838e-07, "logits/chosen": -3.638106346130371, "logits/rejected": -3.825218915939331, "logps/chosen": -1.558204174041748, "logps/rejected": -50.177581787109375, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 1.8915550708770752, "rewards/margins": 1.8915550708770752, "rewards/rejected": 0.0, "step": 1276 }, { "epoch": 7.134078212290503, "grad_norm": 2.603208295074469, "learning_rate": 6.779598841217635e-07, "logits/chosen": -3.2676031589508057, "logits/rejected": -3.6138453483581543, "logps/chosen": -1.8926982879638672, "logps/rejected": -48.11708068847656, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 2.1727561950683594, "rewards/margins": 2.1727561950683594, "rewards/rejected": 0.0, "step": 1277 }, { "epoch": 7.139664804469274, "grad_norm": 2.3267088087405785, "learning_rate": 6.773901225658625e-07, "logits/chosen": -3.5336978435516357, "logits/rejected": -3.351433038711548, "logps/chosen": -2.522913694381714, "logps/rejected": -54.36046600341797, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 2.253704786300659, "rewards/margins": 2.253704786300659, "rewards/rejected": 0.0, "step": 1278 }, { "epoch": 7.145251396648045, "grad_norm": 1.0778863521725175, "learning_rate": 6.768200973766478e-07, "logits/chosen": -3.33052396774292, "logits/rejected": -3.2629311084747314, "logps/chosen": -4.489758491516113, "logps/rejected": -36.76377868652344, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 2.9570207595825195, "rewards/margins": 2.9570207595825195, "rewards/rejected": 0.0, "step": 1279 }, { "epoch": 7.150837988826815, "grad_norm": 1.4641419644916576, "learning_rate": 6.762498094012786e-07, "logits/chosen": -3.2497832775115967, "logits/rejected": -3.2622108459472656, "logps/chosen": -5.682682991027832, "logps/rejected": -47.43547821044922, "loss": 0.1213, "rewards/accuracies": 1.0, "rewards/chosen": 1.835364580154419, "rewards/margins": 1.835364580154419, "rewards/rejected": 0.0, "step": 1280 }, { "epoch": 7.150837988826815, "eval_logits/chosen": -3.366528034210205, "eval_logits/rejected": -3.4954731464385986, "eval_logps/chosen": -25.02907943725586, "eval_logps/rejected": -45.20484161376953, "eval_loss": 0.688734769821167, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": 0.5040964484214783, "eval_rewards/margins": 0.5040964484214783, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7258, "eval_samples_per_second": 9.473, "eval_steps_per_second": 0.306, "step": 1280 }, { "epoch": 7.156424581005586, "grad_norm": 2.3060896288623574, "learning_rate": 6.756792594873042e-07, "logits/chosen": -3.3901827335357666, "logits/rejected": -3.4088780879974365, "logps/chosen": -6.201159477233887, "logps/rejected": -30.820392608642578, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 2.5701470375061035, "rewards/margins": 2.5701470375061035, "rewards/rejected": 0.0, "step": 1281 }, { "epoch": 7.162011173184357, "grad_norm": 1.126589676343673, "learning_rate": 6.751084484826635e-07, "logits/chosen": -3.718583106994629, "logits/rejected": -3.7990591526031494, "logps/chosen": -2.9070825576782227, "logps/rejected": -38.872406005859375, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 2.5956931114196777, "rewards/margins": 2.5956931114196777, "rewards/rejected": 0.0, "step": 1282 }, { "epoch": 7.167597765363128, "grad_norm": 1.243585930640092, "learning_rate": 6.74537377235683e-07, "logits/chosen": -3.553194761276245, "logits/rejected": -3.4584012031555176, "logps/chosen": -2.784400463104248, "logps/rejected": -50.803123474121094, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 2.5433735847473145, "rewards/margins": 2.5433735847473145, "rewards/rejected": 0.0, "step": 1283 }, { "epoch": 7.173184357541899, "grad_norm": 1.0585329819655018, "learning_rate": 6.739660465950769e-07, "logits/chosen": -2.675751209259033, "logits/rejected": -2.5353894233703613, "logps/chosen": -16.220413208007812, "logps/rejected": -52.3974609375, "loss": 0.0922, "rewards/accuracies": 1.0, "rewards/chosen": 2.149181842803955, "rewards/margins": 2.149181842803955, "rewards/rejected": 0.0, "step": 1284 }, { "epoch": 7.17877094972067, "grad_norm": 2.700793634333128, "learning_rate": 6.733944574099436e-07, "logits/chosen": -3.5255300998687744, "logits/rejected": -3.5196940898895264, "logps/chosen": -12.33900260925293, "logps/rejected": -53.60920715332031, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 3.0111923217773438, "rewards/margins": 3.0111923217773438, "rewards/rejected": 0.0, "step": 1285 }, { "epoch": 7.184357541899441, "grad_norm": 1.0390366336173877, "learning_rate": 6.728226105297667e-07, "logits/chosen": -3.7337443828582764, "logits/rejected": -3.573716402053833, "logps/chosen": -1.5995138883590698, "logps/rejected": -57.31018829345703, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.3661184310913086, "rewards/margins": 2.3661184310913086, "rewards/rejected": 0.0, "step": 1286 }, { "epoch": 7.189944134078212, "grad_norm": 1.104951125795208, "learning_rate": 6.722505068044127e-07, "logits/chosen": -3.5012717247009277, "logits/rejected": -3.511260747909546, "logps/chosen": -18.973304748535156, "logps/rejected": -33.1102409362793, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 3.6038103103637695, "rewards/margins": 3.6038103103637695, "rewards/rejected": 0.0, "step": 1287 }, { "epoch": 7.195530726256983, "grad_norm": 1.6404524953515758, "learning_rate": 6.716781470841295e-07, "logits/chosen": -3.426778554916382, "logits/rejected": -3.508471727371216, "logps/chosen": -1.3954945802688599, "logps/rejected": -61.31631088256836, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 1.7865869998931885, "rewards/margins": 1.7865869998931885, "rewards/rejected": 0.0, "step": 1288 }, { "epoch": 7.201117318435754, "grad_norm": 2.086553690039233, "learning_rate": 6.711055322195455e-07, "logits/chosen": -3.692363739013672, "logits/rejected": -3.5762698650360107, "logps/chosen": -7.974408149719238, "logps/rejected": -28.232044219970703, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 3.3824398517608643, "rewards/margins": 3.3824398517608643, "rewards/rejected": 0.0, "step": 1289 }, { "epoch": 7.206703910614525, "grad_norm": 1.3407543422176662, "learning_rate": 6.705326630616683e-07, "logits/chosen": -3.232694625854492, "logits/rejected": -3.614248037338257, "logps/chosen": -1.9202529191970825, "logps/rejected": -54.0255126953125, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.092041254043579, "rewards/margins": 2.092041254043579, "rewards/rejected": 0.0, "step": 1290 }, { "epoch": 7.212290502793296, "grad_norm": 2.7446037098899643, "learning_rate": 6.699595404618836e-07, "logits/chosen": -3.6303162574768066, "logits/rejected": -3.593430519104004, "logps/chosen": -12.793563842773438, "logps/rejected": -23.735065460205078, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.336655616760254, "rewards/margins": 2.336655616760254, "rewards/rejected": 0.0, "step": 1291 }, { "epoch": 7.217877094972067, "grad_norm": 1.1170534654840107, "learning_rate": 6.693861652719536e-07, "logits/chosen": -3.604806661605835, "logits/rejected": -3.708782196044922, "logps/chosen": -5.701390266418457, "logps/rejected": -73.40631103515625, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": 2.7073326110839844, "rewards/margins": 2.7073326110839844, "rewards/rejected": 0.0, "step": 1292 }, { "epoch": 7.223463687150838, "grad_norm": 2.337765936858399, "learning_rate": 6.68812538344016e-07, "logits/chosen": -3.408153533935547, "logits/rejected": -3.438262462615967, "logps/chosen": -51.61593246459961, "logps/rejected": -25.005937576293945, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": 3.4474377632141113, "rewards/margins": 3.4474377632141113, "rewards/rejected": 0.0, "step": 1293 }, { "epoch": 7.229050279329609, "grad_norm": 1.1792228953093058, "learning_rate": 6.682386605305827e-07, "logits/chosen": -3.4808995723724365, "logits/rejected": -3.4360406398773193, "logps/chosen": -3.751499652862549, "logps/rejected": -42.26259994506836, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 2.625836133956909, "rewards/margins": 2.625836133956909, "rewards/rejected": 0.0, "step": 1294 }, { "epoch": 7.23463687150838, "grad_norm": 4.963918382196016, "learning_rate": 6.676645326845379e-07, "logits/chosen": -3.3212473392486572, "logits/rejected": -3.2344558238983154, "logps/chosen": -8.124931335449219, "logps/rejected": -29.031097412109375, "loss": 0.1292, "rewards/accuracies": 1.0, "rewards/chosen": 2.6535797119140625, "rewards/margins": 2.6535797119140625, "rewards/rejected": 0.0, "step": 1295 }, { "epoch": 7.240223463687151, "grad_norm": 1.1186223144673042, "learning_rate": 6.670901556591383e-07, "logits/chosen": -3.446485996246338, "logits/rejected": -3.4796955585479736, "logps/chosen": -1.435394287109375, "logps/rejected": -35.34324645996094, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 1.9041576385498047, "rewards/margins": 1.9041576385498047, "rewards/rejected": 0.0, "step": 1296 }, { "epoch": 7.245810055865922, "grad_norm": 2.754067509491694, "learning_rate": 6.665155303080103e-07, "logits/chosen": -3.3536322116851807, "logits/rejected": -3.560081720352173, "logps/chosen": -10.537589073181152, "logps/rejected": -37.63751220703125, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 3.356642723083496, "rewards/margins": 3.356642723083496, "rewards/rejected": 0.0, "step": 1297 }, { "epoch": 7.251396648044693, "grad_norm": 1.9364333731940997, "learning_rate": 6.659406574851494e-07, "logits/chosen": -3.443791151046753, "logits/rejected": -3.346338987350464, "logps/chosen": -6.035096168518066, "logps/rejected": -64.06090545654297, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 3.0059869289398193, "rewards/margins": 3.0059869289398193, "rewards/rejected": 0.0, "step": 1298 }, { "epoch": 7.256983240223463, "grad_norm": 1.4585896778906036, "learning_rate": 6.653655380449192e-07, "logits/chosen": -2.924417018890381, "logits/rejected": -2.9158060550689697, "logps/chosen": -22.547714233398438, "logps/rejected": -62.49207305908203, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": 2.475858211517334, "rewards/margins": 2.475858211517334, "rewards/rejected": 0.0, "step": 1299 }, { "epoch": 7.262569832402234, "grad_norm": 2.772557178346172, "learning_rate": 6.647901728420493e-07, "logits/chosen": -3.595991373062134, "logits/rejected": -3.5744409561157227, "logps/chosen": -18.906320571899414, "logps/rejected": -29.811229705810547, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 2.889451742172241, "rewards/margins": 2.889451742172241, "rewards/rejected": 0.0, "step": 1300 }, { "epoch": 7.262569832402234, "eval_logits/chosen": -3.3801612854003906, "eval_logits/rejected": -3.5086123943328857, "eval_logps/chosen": -24.592073440551758, "eval_logps/rejected": -44.9486083984375, "eval_loss": 0.6780498027801514, "eval_rewards/accuracies": 0.824999988079071, "eval_rewards/chosen": 0.5477972626686096, "eval_rewards/margins": 0.5477972626686096, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7423, "eval_samples_per_second": 9.468, "eval_steps_per_second": 0.305, "step": 1300 }, { "epoch": 7.268156424581005, "grad_norm": 1.6144703090043173, "learning_rate": 6.64214562731635e-07, "logits/chosen": -3.5145280361175537, "logits/rejected": -3.453125476837158, "logps/chosen": -15.009201049804688, "logps/rejected": -35.985870361328125, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 3.4529547691345215, "rewards/margins": 3.4529547691345215, "rewards/rejected": 0.0, "step": 1301 }, { "epoch": 7.273743016759776, "grad_norm": 13.75040159306429, "learning_rate": 6.636387085691355e-07, "logits/chosen": -3.710878610610962, "logits/rejected": -3.6133155822753906, "logps/chosen": -19.006427764892578, "logps/rejected": -27.593320846557617, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": 2.8588337898254395, "rewards/margins": 2.8588337898254395, "rewards/rejected": 0.0, "step": 1302 }, { "epoch": 7.279329608938547, "grad_norm": 1.5685706957219425, "learning_rate": 6.630626112103727e-07, "logits/chosen": -3.2809746265411377, "logits/rejected": -3.403027057647705, "logps/chosen": -5.856812477111816, "logps/rejected": -106.83222198486328, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 2.7082905769348145, "rewards/margins": 2.7082905769348145, "rewards/rejected": 0.0, "step": 1303 }, { "epoch": 7.284916201117318, "grad_norm": 5.498404105160891, "learning_rate": 6.624862715115297e-07, "logits/chosen": -3.476365566253662, "logits/rejected": -3.4864091873168945, "logps/chosen": -8.03408432006836, "logps/rejected": -54.53045654296875, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 2.623234272003174, "rewards/margins": 2.623234272003174, "rewards/rejected": 0.0, "step": 1304 }, { "epoch": 7.290502793296089, "grad_norm": 1.7987959234105906, "learning_rate": 6.6190969032915e-07, "logits/chosen": -3.5582940578460693, "logits/rejected": -3.632675886154175, "logps/chosen": -4.348865985870361, "logps/rejected": -73.341552734375, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 2.675520181655884, "rewards/margins": 2.675520181655884, "rewards/rejected": 0.0, "step": 1305 }, { "epoch": 7.29608938547486, "grad_norm": 7.064715714550168, "learning_rate": 6.613328685201361e-07, "logits/chosen": -3.6168596744537354, "logits/rejected": -3.6552817821502686, "logps/chosen": -9.071053504943848, "logps/rejected": -21.970504760742188, "loss": 0.1761, "rewards/accuracies": 0.75, "rewards/chosen": 2.4692320823669434, "rewards/margins": 2.4692320823669434, "rewards/rejected": 0.0, "step": 1306 }, { "epoch": 7.301675977653631, "grad_norm": 1.3862521233978937, "learning_rate": 6.607558069417476e-07, "logits/chosen": -3.6154673099517822, "logits/rejected": -3.7376348972320557, "logps/chosen": -8.16813850402832, "logps/rejected": -33.8210334777832, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.583134651184082, "rewards/margins": 2.583134651184082, "rewards/rejected": 0.0, "step": 1307 }, { "epoch": 7.307262569832402, "grad_norm": 1.8055495272432047, "learning_rate": 6.60178506451601e-07, "logits/chosen": -3.5150320529937744, "logits/rejected": -3.4990432262420654, "logps/chosen": -1.468370795249939, "logps/rejected": -52.503963470458984, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 1.68797767162323, "rewards/margins": 1.68797767162323, "rewards/rejected": 0.0, "step": 1308 }, { "epoch": 7.312849162011173, "grad_norm": 2.222508934949117, "learning_rate": 6.596009679076675e-07, "logits/chosen": -3.292964458465576, "logits/rejected": -3.464843988418579, "logps/chosen": -11.528118133544922, "logps/rejected": -59.465240478515625, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 2.302183151245117, "rewards/margins": 2.302183151245117, "rewards/rejected": 0.0, "step": 1309 }, { "epoch": 7.318435754189944, "grad_norm": 1.0094083066470343, "learning_rate": 6.590231921682724e-07, "logits/chosen": -3.560662269592285, "logits/rejected": -3.674679756164551, "logps/chosen": -9.870981216430664, "logps/rejected": -36.52283477783203, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 3.251835346221924, "rewards/margins": 3.251835346221924, "rewards/rejected": 0.0, "step": 1310 }, { "epoch": 7.324022346368715, "grad_norm": 1.2169747164098956, "learning_rate": 6.584451800920932e-07, "logits/chosen": -3.272944450378418, "logits/rejected": -3.4275331497192383, "logps/chosen": -0.7554080486297607, "logps/rejected": -94.23004150390625, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 1.7341724634170532, "rewards/margins": 1.7341724634170532, "rewards/rejected": 0.0, "step": 1311 }, { "epoch": 7.329608938547486, "grad_norm": 6.70686787687767, "learning_rate": 6.578669325381589e-07, "logits/chosen": -3.729093551635742, "logits/rejected": -3.688983678817749, "logps/chosen": -12.128386497497559, "logps/rejected": -46.44956970214844, "loss": 0.1367, "rewards/accuracies": 1.0, "rewards/chosen": 3.1582024097442627, "rewards/margins": 3.1582024097442627, "rewards/rejected": 0.0, "step": 1312 }, { "epoch": 7.335195530726257, "grad_norm": 1.0753440753555625, "learning_rate": 6.572884503658482e-07, "logits/chosen": -2.8142926692962646, "logits/rejected": -2.823444128036499, "logps/chosen": -69.76309967041016, "logps/rejected": -50.779598236083984, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 3.2833805084228516, "rewards/margins": 3.2833805084228516, "rewards/rejected": 0.0, "step": 1313 }, { "epoch": 7.340782122905028, "grad_norm": 2.4490095582853115, "learning_rate": 6.567097344348889e-07, "logits/chosen": -3.2057392597198486, "logits/rejected": -3.4715209007263184, "logps/chosen": -3.2136125564575195, "logps/rejected": -28.98649787902832, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 2.4721896648406982, "rewards/margins": 2.4721896648406982, "rewards/rejected": 0.0, "step": 1314 }, { "epoch": 7.346368715083799, "grad_norm": 2.898508073567845, "learning_rate": 6.561307856053555e-07, "logits/chosen": -3.6992645263671875, "logits/rejected": -3.738304615020752, "logps/chosen": -8.727750778198242, "logps/rejected": -50.29473114013672, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 3.2454607486724854, "rewards/margins": 3.2454607486724854, "rewards/rejected": 0.0, "step": 1315 }, { "epoch": 7.351955307262569, "grad_norm": 1.9034543468916336, "learning_rate": 6.555516047376695e-07, "logits/chosen": -3.5489768981933594, "logits/rejected": -3.7010304927825928, "logps/chosen": -4.087454795837402, "logps/rejected": -40.70397186279297, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 2.49857234954834, "rewards/margins": 2.49857234954834, "rewards/rejected": 0.0, "step": 1316 }, { "epoch": 7.35754189944134, "grad_norm": 7.340890756786072, "learning_rate": 6.549721926925965e-07, "logits/chosen": -3.744727849960327, "logits/rejected": -3.801370620727539, "logps/chosen": -17.25729751586914, "logps/rejected": -53.50926971435547, "loss": 0.1597, "rewards/accuracies": 1.0, "rewards/chosen": 2.3661885261535645, "rewards/margins": 2.3661885261535645, "rewards/rejected": 0.0, "step": 1317 }, { "epoch": 7.363128491620111, "grad_norm": 1.7321282384917218, "learning_rate": 6.543925503312462e-07, "logits/chosen": -3.7766597270965576, "logits/rejected": -3.6987500190734863, "logps/chosen": -3.585883617401123, "logps/rejected": -24.525768280029297, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 2.4641125202178955, "rewards/margins": 2.4641125202178955, "rewards/rejected": 0.0, "step": 1318 }, { "epoch": 7.368715083798882, "grad_norm": 1.643057365136865, "learning_rate": 6.538126785150704e-07, "logits/chosen": -3.5576860904693604, "logits/rejected": -3.6237735748291016, "logps/chosen": -11.430519104003906, "logps/rejected": -24.14063262939453, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 3.605529308319092, "rewards/margins": 3.605529308319092, "rewards/rejected": 0.0, "step": 1319 }, { "epoch": 7.374301675977653, "grad_norm": 1.0075092519372641, "learning_rate": 6.532325781058616e-07, "logits/chosen": -3.627290964126587, "logits/rejected": -3.6239960193634033, "logps/chosen": -4.140728950500488, "logps/rejected": -57.34563446044922, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 2.8437869548797607, "rewards/margins": 2.8437869548797607, "rewards/rejected": 0.0, "step": 1320 }, { "epoch": 7.374301675977653, "eval_logits/chosen": -3.3619918823242188, "eval_logits/rejected": -3.4917492866516113, "eval_logps/chosen": -24.981060028076172, "eval_logps/rejected": -44.87649917602539, "eval_loss": 0.6837815046310425, "eval_rewards/accuracies": 0.7749999761581421, "eval_rewards/chosen": 0.5088983774185181, "eval_rewards/margins": 0.5088983774185181, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7419, "eval_samples_per_second": 9.468, "eval_steps_per_second": 0.305, "step": 1320 }, { "epoch": 7.379888268156424, "grad_norm": 1.933407383049931, "learning_rate": 6.526522499657526e-07, "logits/chosen": -3.636347770690918, "logits/rejected": -3.7644951343536377, "logps/chosen": -18.489547729492188, "logps/rejected": -52.53203582763672, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 3.291066884994507, "rewards/margins": 3.291066884994507, "rewards/rejected": 0.0, "step": 1321 }, { "epoch": 7.385474860335195, "grad_norm": 2.543537622248895, "learning_rate": 6.520716949572142e-07, "logits/chosen": -3.3230035305023193, "logits/rejected": -3.4805049896240234, "logps/chosen": -9.104710578918457, "logps/rejected": -39.75267791748047, "loss": 0.1369, "rewards/accuracies": 1.0, "rewards/chosen": 3.337641716003418, "rewards/margins": 3.337641716003418, "rewards/rejected": 0.0, "step": 1322 }, { "epoch": 7.391061452513966, "grad_norm": 2.669030596207329, "learning_rate": 6.514909139430548e-07, "logits/chosen": -3.0569634437561035, "logits/rejected": -3.1443471908569336, "logps/chosen": -9.642778396606445, "logps/rejected": -35.89179992675781, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 2.8764891624450684, "rewards/margins": 2.8764891624450684, "rewards/rejected": 0.0, "step": 1323 }, { "epoch": 7.396648044692737, "grad_norm": 1.3889452322670262, "learning_rate": 6.509099077864179e-07, "logits/chosen": -3.413339853286743, "logits/rejected": -3.6618871688842773, "logps/chosen": -3.332961082458496, "logps/rejected": -51.81547927856445, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 2.3606014251708984, "rewards/margins": 2.3606014251708984, "rewards/rejected": 0.0, "step": 1324 }, { "epoch": 7.402234636871508, "grad_norm": 2.696801803963323, "learning_rate": 6.503286773507828e-07, "logits/chosen": -3.6457481384277344, "logits/rejected": -3.445002317428589, "logps/chosen": -7.384559631347656, "logps/rejected": -45.16246032714844, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": 2.4800567626953125, "rewards/margins": 2.4800567626953125, "rewards/rejected": 0.0, "step": 1325 }, { "epoch": 7.407821229050279, "grad_norm": 1.676843188205714, "learning_rate": 6.497472234999608e-07, "logits/chosen": -3.5677103996276855, "logits/rejected": -3.4524805545806885, "logps/chosen": -12.379425048828125, "logps/rejected": -22.92210578918457, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 3.429171562194824, "rewards/margins": 3.429171562194824, "rewards/rejected": 0.0, "step": 1326 }, { "epoch": 7.41340782122905, "grad_norm": 3.349754471052375, "learning_rate": 6.491655470980964e-07, "logits/chosen": -3.7054364681243896, "logits/rejected": -3.617354154586792, "logps/chosen": -2.215886116027832, "logps/rejected": -32.634761810302734, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 2.1901493072509766, "rewards/margins": 2.1901493072509766, "rewards/rejected": 0.0, "step": 1327 }, { "epoch": 7.418994413407821, "grad_norm": 7.6664406022562614, "learning_rate": 6.485836490096639e-07, "logits/chosen": -3.5133492946624756, "logits/rejected": -3.573883295059204, "logps/chosen": -2.703808307647705, "logps/rejected": -37.35500717163086, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.566051483154297, "rewards/margins": 2.566051483154297, "rewards/rejected": 0.0, "step": 1328 }, { "epoch": 7.424581005586592, "grad_norm": 0.9687419745063361, "learning_rate": 6.480015300994677e-07, "logits/chosen": -2.770904064178467, "logits/rejected": -2.7978460788726807, "logps/chosen": -11.93392562866211, "logps/rejected": -45.412567138671875, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 3.1353366374969482, "rewards/margins": 3.1353366374969482, "rewards/rejected": 0.0, "step": 1329 }, { "epoch": 7.430167597765363, "grad_norm": 3.5022599679615394, "learning_rate": 6.474191912326403e-07, "logits/chosen": -3.3307547569274902, "logits/rejected": -3.483093738555908, "logps/chosen": -3.9055068492889404, "logps/rejected": -44.35377502441406, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 2.005783796310425, "rewards/margins": 2.005783796310425, "rewards/rejected": 0.0, "step": 1330 }, { "epoch": 7.435754189944134, "grad_norm": 0.9502362216600457, "learning_rate": 6.468366332746406e-07, "logits/chosen": -3.3484418392181396, "logits/rejected": -3.2991244792938232, "logps/chosen": -3.163999080657959, "logps/rejected": -51.34564208984375, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": 2.339700222015381, "rewards/margins": 2.339700222015381, "rewards/rejected": 0.0, "step": 1331 }, { "epoch": 7.441340782122905, "grad_norm": 1.3659937131735682, "learning_rate": 6.462538570912538e-07, "logits/chosen": -3.2325432300567627, "logits/rejected": -3.4618213176727295, "logps/chosen": -3.2940423488616943, "logps/rejected": -40.990081787109375, "loss": 0.1101, "rewards/accuracies": 1.0, "rewards/chosen": 3.100055694580078, "rewards/margins": 3.100055694580078, "rewards/rejected": 0.0, "step": 1332 }, { "epoch": 7.446927374301676, "grad_norm": 1.3018900265061804, "learning_rate": 6.456708635485889e-07, "logits/chosen": -3.415616750717163, "logits/rejected": -3.306875228881836, "logps/chosen": -8.464548110961914, "logps/rejected": -21.140100479125977, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 2.3580121994018555, "rewards/margins": 2.3580121994018555, "rewards/rejected": 0.0, "step": 1333 }, { "epoch": 7.452513966480447, "grad_norm": 1.73838771227929, "learning_rate": 6.450876535130781e-07, "logits/chosen": -3.4838478565216064, "logits/rejected": -3.6847469806671143, "logps/chosen": -1.210036039352417, "logps/rejected": -52.936614990234375, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 2.044281005859375, "rewards/margins": 2.044281005859375, "rewards/rejected": 0.0, "step": 1334 }, { "epoch": 7.4581005586592175, "grad_norm": 1.3309924063638015, "learning_rate": 6.445042278514758e-07, "logits/chosen": -3.348163366317749, "logits/rejected": -3.305093288421631, "logps/chosen": -6.772055149078369, "logps/rejected": -59.42049026489258, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2093491554260254, "rewards/margins": 2.2093491554260254, "rewards/rejected": 0.0, "step": 1335 }, { "epoch": 7.4636871508379885, "grad_norm": 1.2217187287545177, "learning_rate": 6.439205874308558e-07, "logits/chosen": -3.406967878341675, "logits/rejected": -3.4981319904327393, "logps/chosen": -4.282097816467285, "logps/rejected": -32.48981857299805, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 2.5642776489257812, "rewards/margins": 2.5642776489257812, "rewards/rejected": 0.0, "step": 1336 }, { "epoch": 7.4692737430167595, "grad_norm": 1.4537598880879743, "learning_rate": 6.433367331186121e-07, "logits/chosen": -3.6480681896209717, "logits/rejected": -3.6677560806274414, "logps/chosen": -2.5883102416992188, "logps/rejected": -42.5275764465332, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": 2.205169439315796, "rewards/margins": 2.205169439315796, "rewards/rejected": 0.0, "step": 1337 }, { "epoch": 7.4748603351955305, "grad_norm": 3.323751503592095, "learning_rate": 6.427526657824563e-07, "logits/chosen": -3.100895404815674, "logits/rejected": -3.221378803253174, "logps/chosen": -9.436579704284668, "logps/rejected": -37.93687438964844, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.607145071029663, "rewards/margins": 2.607145071029663, "rewards/rejected": 0.0, "step": 1338 }, { "epoch": 7.4804469273743015, "grad_norm": 2.498765371403953, "learning_rate": 6.421683862904162e-07, "logits/chosen": -3.614877223968506, "logits/rejected": -3.744763135910034, "logps/chosen": -12.693595886230469, "logps/rejected": -46.41218566894531, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 3.1861581802368164, "rewards/margins": 3.1861581802368164, "rewards/rejected": 0.0, "step": 1339 }, { "epoch": 7.4860335195530725, "grad_norm": 3.5104236517711866, "learning_rate": 6.415838955108352e-07, "logits/chosen": -3.611034631729126, "logits/rejected": -3.6447675228118896, "logps/chosen": -3.8875608444213867, "logps/rejected": -37.52974319458008, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 2.5034303665161133, "rewards/margins": 2.5034303665161133, "rewards/rejected": 0.0, "step": 1340 }, { "epoch": 7.4860335195530725, "eval_logits/chosen": -3.3717868328094482, "eval_logits/rejected": -3.5007662773132324, "eval_logps/chosen": -24.902873992919922, "eval_logps/rejected": -45.34352493286133, "eval_loss": 0.6902182102203369, "eval_rewards/accuracies": 0.7749999761581421, "eval_rewards/chosen": 0.5167169570922852, "eval_rewards/margins": 0.5167169570922852, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7296, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 1340 }, { "epoch": 7.4916201117318435, "grad_norm": 1.9746337321102174, "learning_rate": 6.409991943123708e-07, "logits/chosen": -3.668294906616211, "logits/rejected": -3.6877212524414062, "logps/chosen": -9.687832832336426, "logps/rejected": -51.59500503540039, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 3.5231974124908447, "rewards/margins": 3.5231974124908447, "rewards/rejected": 0.0, "step": 1341 }, { "epoch": 7.4972067039106145, "grad_norm": 3.10042766484964, "learning_rate": 6.40414283563993e-07, "logits/chosen": -3.335176706314087, "logits/rejected": -3.398043632507324, "logps/chosen": -29.620380401611328, "logps/rejected": -28.167808532714844, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 3.102391481399536, "rewards/margins": 3.102391481399536, "rewards/rejected": 0.0, "step": 1342 }, { "epoch": 7.5027932960893855, "grad_norm": 1.8660695386987614, "learning_rate": 6.398291641349835e-07, "logits/chosen": -3.3379101753234863, "logits/rejected": -3.1388955116271973, "logps/chosen": -3.3202102184295654, "logps/rejected": -45.3973388671875, "loss": 0.1241, "rewards/accuracies": 1.0, "rewards/chosen": 2.2802844047546387, "rewards/margins": 2.2802844047546387, "rewards/rejected": 0.0, "step": 1343 }, { "epoch": 7.5083798882681565, "grad_norm": 1.118053151884192, "learning_rate": 6.392438368949338e-07, "logits/chosen": -3.490548849105835, "logits/rejected": -3.5045557022094727, "logps/chosen": -17.330812454223633, "logps/rejected": -46.646480560302734, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.551020622253418, "rewards/margins": 2.551020622253418, "rewards/rejected": 0.0, "step": 1344 }, { "epoch": 7.5139664804469275, "grad_norm": 1.5846715669566198, "learning_rate": 6.386583027137446e-07, "logits/chosen": -3.4728899002075195, "logits/rejected": -3.342719078063965, "logps/chosen": -6.973177909851074, "logps/rejected": -72.66995239257812, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 2.5646984577178955, "rewards/margins": 2.5646984577178955, "rewards/rejected": 0.0, "step": 1345 }, { "epoch": 7.5195530726256985, "grad_norm": 2.014230838623146, "learning_rate": 6.380725624616239e-07, "logits/chosen": -3.4623639583587646, "logits/rejected": -3.6248762607574463, "logps/chosen": -7.952892303466797, "logps/rejected": -51.53924560546875, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 2.5686726570129395, "rewards/margins": 2.5686726570129395, "rewards/rejected": 0.0, "step": 1346 }, { "epoch": 7.5251396648044695, "grad_norm": 1.158398029141674, "learning_rate": 6.374866170090858e-07, "logits/chosen": -3.652508020401001, "logits/rejected": -3.475364923477173, "logps/chosen": -6.89300012588501, "logps/rejected": -44.19477844238281, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 2.634492874145508, "rewards/margins": 2.634492874145508, "rewards/rejected": 0.0, "step": 1347 }, { "epoch": 7.5307262569832405, "grad_norm": 1.2878840045831155, "learning_rate": 6.3690046722695e-07, "logits/chosen": -3.477654457092285, "logits/rejected": -3.5552642345428467, "logps/chosen": -9.743959426879883, "logps/rejected": -57.691184997558594, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": 2.865281581878662, "rewards/margins": 2.865281581878662, "rewards/rejected": 0.0, "step": 1348 }, { "epoch": 7.5363128491620115, "grad_norm": 2.260138463270068, "learning_rate": 6.363141139863393e-07, "logits/chosen": -3.523780584335327, "logits/rejected": -3.8094823360443115, "logps/chosen": -2.557177782058716, "logps/rejected": -60.25283432006836, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.2795915603637695, "rewards/margins": 2.2795915603637695, "rewards/rejected": 0.0, "step": 1349 }, { "epoch": 7.5418994413407825, "grad_norm": 3.1029917301525454, "learning_rate": 6.357275581586787e-07, "logits/chosen": -3.578190565109253, "logits/rejected": -3.648209571838379, "logps/chosen": -5.742236137390137, "logps/rejected": -31.89114761352539, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 2.640815496444702, "rewards/margins": 2.640815496444702, "rewards/rejected": 0.0, "step": 1350 }, { "epoch": 7.547486033519553, "grad_norm": 1.8185286470035993, "learning_rate": 6.351408006156952e-07, "logits/chosen": -3.3877675533294678, "logits/rejected": -3.6783602237701416, "logps/chosen": -4.36691951751709, "logps/rejected": -93.2117691040039, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 1.9463605880737305, "rewards/margins": 1.9463605880737305, "rewards/rejected": 0.0, "step": 1351 }, { "epoch": 7.553072625698324, "grad_norm": 1.4962646551008194, "learning_rate": 6.345538422294146e-07, "logits/chosen": -3.5849568843841553, "logits/rejected": -3.41233491897583, "logps/chosen": -8.534076690673828, "logps/rejected": -20.666913986206055, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.941847801208496, "rewards/margins": 2.941847801208496, "rewards/rejected": 0.0, "step": 1352 }, { "epoch": 7.558659217877095, "grad_norm": 3.0771917352843436, "learning_rate": 6.339666838721618e-07, "logits/chosen": -3.6744654178619385, "logits/rejected": -3.5337159633636475, "logps/chosen": -6.005249500274658, "logps/rejected": -44.79070281982422, "loss": 0.1613, "rewards/accuracies": 1.0, "rewards/chosen": 2.5271992683410645, "rewards/margins": 2.5271992683410645, "rewards/rejected": 0.0, "step": 1353 }, { "epoch": 7.564245810055866, "grad_norm": 1.4739657920602398, "learning_rate": 6.333793264165585e-07, "logits/chosen": -3.5482747554779053, "logits/rejected": -3.4847331047058105, "logps/chosen": -2.1421570777893066, "logps/rejected": -73.30380249023438, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": 2.2679672241210938, "rewards/margins": 2.2679672241210938, "rewards/rejected": 0.0, "step": 1354 }, { "epoch": 7.569832402234637, "grad_norm": 3.218194655153971, "learning_rate": 6.327917707355227e-07, "logits/chosen": -3.604827642440796, "logits/rejected": -3.4153332710266113, "logps/chosen": -2.7689061164855957, "logps/rejected": -67.11903381347656, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 2.2885615825653076, "rewards/margins": 2.2885615825653076, "rewards/rejected": 0.0, "step": 1355 }, { "epoch": 7.575418994413408, "grad_norm": 2.9586776277300273, "learning_rate": 6.322040177022666e-07, "logits/chosen": -3.6942250728607178, "logits/rejected": -3.5640459060668945, "logps/chosen": -10.615409851074219, "logps/rejected": -36.34138488769531, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 3.105916976928711, "rewards/margins": 3.105916976928711, "rewards/rejected": 0.0, "step": 1356 }, { "epoch": 7.581005586592179, "grad_norm": 1.2298329754678976, "learning_rate": 6.316160681902959e-07, "logits/chosen": -3.546031951904297, "logits/rejected": -3.579220771789551, "logps/chosen": -2.216337203979492, "logps/rejected": -45.56477355957031, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 2.4204089641571045, "rewards/margins": 2.4204089641571045, "rewards/rejected": 0.0, "step": 1357 }, { "epoch": 7.58659217877095, "grad_norm": 1.1688780139728336, "learning_rate": 6.310279230734084e-07, "logits/chosen": -3.5705759525299072, "logits/rejected": -3.547123908996582, "logps/chosen": -3.538571357727051, "logps/rejected": -37.826637268066406, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.537139415740967, "rewards/margins": 2.537139415740967, "rewards/rejected": 0.0, "step": 1358 }, { "epoch": 7.592178770949721, "grad_norm": 5.304963121125634, "learning_rate": 6.304395832256925e-07, "logits/chosen": -3.28761625289917, "logits/rejected": -3.271043539047241, "logps/chosen": -4.488574504852295, "logps/rejected": -73.1676025390625, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 2.421415328979492, "rewards/margins": 2.421415328979492, "rewards/rejected": 0.0, "step": 1359 }, { "epoch": 7.597765363128492, "grad_norm": 2.023221321031377, "learning_rate": 6.298510495215258e-07, "logits/chosen": -3.6465795040130615, "logits/rejected": -3.650517225265503, "logps/chosen": -5.583526611328125, "logps/rejected": -19.741701126098633, "loss": 0.127, "rewards/accuracies": 1.0, "rewards/chosen": 2.5471010208129883, "rewards/margins": 2.5471010208129883, "rewards/rejected": 0.0, "step": 1360 }, { "epoch": 7.597765363128492, "eval_logits/chosen": -3.377352476119995, "eval_logits/rejected": -3.5058913230895996, "eval_logps/chosen": -24.613401412963867, "eval_logps/rejected": -45.38679122924805, "eval_loss": 0.6831362247467041, "eval_rewards/accuracies": 0.7749999761581421, "eval_rewards/chosen": 0.5456639528274536, "eval_rewards/margins": 0.5456639528274536, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6911, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 1360 }, { "epoch": 7.603351955307263, "grad_norm": 1.8617615502700466, "learning_rate": 6.292623228355742e-07, "logits/chosen": -3.835742712020874, "logits/rejected": -3.8247580528259277, "logps/chosen": -17.866558074951172, "logps/rejected": -32.8570442199707, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 3.352844715118408, "rewards/margins": 3.352844715118408, "rewards/rejected": 0.0, "step": 1361 }, { "epoch": 7.608938547486034, "grad_norm": 1.1518435267353249, "learning_rate": 6.286734040427908e-07, "logits/chosen": -3.4593331813812256, "logits/rejected": -3.492604970932007, "logps/chosen": -7.050579071044922, "logps/rejected": -48.49711608886719, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 2.9449377059936523, "rewards/margins": 2.9449377059936523, "rewards/rejected": 0.0, "step": 1362 }, { "epoch": 7.614525139664805, "grad_norm": 2.0091735988016413, "learning_rate": 6.280842940184132e-07, "logits/chosen": -3.2921576499938965, "logits/rejected": -3.3532657623291016, "logps/chosen": -4.33574104309082, "logps/rejected": -46.30808639526367, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.816415309906006, "rewards/margins": 2.816415309906006, "rewards/rejected": 0.0, "step": 1363 }, { "epoch": 7.620111731843576, "grad_norm": 6.099087823168098, "learning_rate": 6.274949936379643e-07, "logits/chosen": -3.1784050464630127, "logits/rejected": -3.2182962894439697, "logps/chosen": -18.77121353149414, "logps/rejected": -33.73542022705078, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 2.436699628829956, "rewards/margins": 2.436699628829956, "rewards/rejected": 0.0, "step": 1364 }, { "epoch": 7.625698324022347, "grad_norm": 1.6373569972408615, "learning_rate": 6.269055037772491e-07, "logits/chosen": -3.4280288219451904, "logits/rejected": -3.6520519256591797, "logps/chosen": -1.1923744678497314, "logps/rejected": -47.18501281738281, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 1.9932301044464111, "rewards/margins": 1.9932301044464111, "rewards/rejected": 0.0, "step": 1365 }, { "epoch": 7.631284916201118, "grad_norm": 2.7189590731815687, "learning_rate": 6.263158253123547e-07, "logits/chosen": -3.5601494312286377, "logits/rejected": -3.5052757263183594, "logps/chosen": -9.23727035522461, "logps/rejected": -65.1521224975586, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 2.8892292976379395, "rewards/margins": 2.8892292976379395, "rewards/rejected": 0.0, "step": 1366 }, { "epoch": 7.636871508379889, "grad_norm": 5.333570060425901, "learning_rate": 6.257259591196484e-07, "logits/chosen": -3.7892396450042725, "logits/rejected": -3.836857318878174, "logps/chosen": -14.42813491821289, "logps/rejected": -40.66142272949219, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 2.175755500793457, "rewards/margins": 2.175755500793457, "rewards/rejected": 0.0, "step": 1367 }, { "epoch": 7.64245810055866, "grad_norm": 1.5204927438185927, "learning_rate": 6.251359060757762e-07, "logits/chosen": -3.2748348712921143, "logits/rejected": -3.301457643508911, "logps/chosen": -1.2259882688522339, "logps/rejected": -36.65496063232422, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 1.735575795173645, "rewards/margins": 1.735575795173645, "rewards/rejected": 0.0, "step": 1368 }, { "epoch": 7.648044692737431, "grad_norm": 3.374061606176809, "learning_rate": 6.245456670576621e-07, "logits/chosen": -3.1197638511657715, "logits/rejected": -3.1260502338409424, "logps/chosen": -26.035179138183594, "logps/rejected": -49.376625061035156, "loss": 0.1317, "rewards/accuracies": 1.0, "rewards/chosen": 2.248068332672119, "rewards/margins": 2.248068332672119, "rewards/rejected": 0.0, "step": 1369 }, { "epoch": 7.653631284916202, "grad_norm": 7.988430551598308, "learning_rate": 6.239552429425063e-07, "logits/chosen": -3.3330259323120117, "logits/rejected": -3.3822152614593506, "logps/chosen": -5.702103614807129, "logps/rejected": -44.26451873779297, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 2.56434965133667, "rewards/margins": 2.56434965133667, "rewards/rejected": 0.0, "step": 1370 }, { "epoch": 7.659217877094972, "grad_norm": 2.0178085734716706, "learning_rate": 6.233646346077844e-07, "logits/chosen": -3.6526939868927, "logits/rejected": -3.6671154499053955, "logps/chosen": -4.053013801574707, "logps/rejected": -42.36505889892578, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 2.2508602142333984, "rewards/margins": 2.2508602142333984, "rewards/rejected": 0.0, "step": 1371 }, { "epoch": 7.664804469273743, "grad_norm": 4.93361389110381, "learning_rate": 6.227738429312455e-07, "logits/chosen": -3.4513962268829346, "logits/rejected": -3.3486549854278564, "logps/chosen": -3.926506996154785, "logps/rejected": -51.46466064453125, "loss": 0.1274, "rewards/accuracies": 1.0, "rewards/chosen": 1.7176238298416138, "rewards/margins": 1.7176238298416138, "rewards/rejected": 0.0, "step": 1372 }, { "epoch": 7.670391061452514, "grad_norm": 1.2575238577960888, "learning_rate": 6.221828687909108e-07, "logits/chosen": -3.532562732696533, "logits/rejected": -3.596668004989624, "logps/chosen": -3.615190267562866, "logps/rejected": -54.00471115112305, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 2.430891990661621, "rewards/margins": 2.430891990661621, "rewards/rejected": 0.0, "step": 1373 }, { "epoch": 7.675977653631285, "grad_norm": 2.0991029561289083, "learning_rate": 6.215917130650739e-07, "logits/chosen": -3.5558907985687256, "logits/rejected": -3.493356704711914, "logps/chosen": -2.2038702964782715, "logps/rejected": -34.609466552734375, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.418152332305908, "rewards/margins": 2.418152332305908, "rewards/rejected": 0.0, "step": 1374 }, { "epoch": 7.681564245810056, "grad_norm": 1.631611969715661, "learning_rate": 6.210003766322969e-07, "logits/chosen": -3.4739394187927246, "logits/rejected": -3.3518495559692383, "logps/chosen": -14.285957336425781, "logps/rejected": -38.5153923034668, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 3.2559762001037598, "rewards/margins": 3.2559762001037598, "rewards/rejected": 0.0, "step": 1375 }, { "epoch": 7.687150837988827, "grad_norm": 2.013490296619291, "learning_rate": 6.204088603714111e-07, "logits/chosen": -3.411038398742676, "logits/rejected": -3.455512762069702, "logps/chosen": -2.3266758918762207, "logps/rejected": -44.296722412109375, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": 2.3725152015686035, "rewards/margins": 2.3725152015686035, "rewards/rejected": 0.0, "step": 1376 }, { "epoch": 7.692737430167598, "grad_norm": 2.404214334111452, "learning_rate": 6.198171651615154e-07, "logits/chosen": -3.2101945877075195, "logits/rejected": -3.4877567291259766, "logps/chosen": -2.0130977630615234, "logps/rejected": -56.4400634765625, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 1.49690580368042, "rewards/margins": 1.49690580368042, "rewards/rejected": 0.0, "step": 1377 }, { "epoch": 7.698324022346369, "grad_norm": 12.290736935137124, "learning_rate": 6.192252918819739e-07, "logits/chosen": -3.4152767658233643, "logits/rejected": -3.3613784313201904, "logps/chosen": -26.862014770507812, "logps/rejected": -40.26948547363281, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 3.6470947265625, "rewards/margins": 3.6470947265625, "rewards/rejected": 0.0, "step": 1378 }, { "epoch": 7.70391061452514, "grad_norm": 2.6495774255832147, "learning_rate": 6.186332414124159e-07, "logits/chosen": -2.77083158493042, "logits/rejected": -2.869678020477295, "logps/chosen": -6.970669746398926, "logps/rejected": -46.74264907836914, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 2.762322425842285, "rewards/margins": 2.762322425842285, "rewards/rejected": 0.0, "step": 1379 }, { "epoch": 7.709497206703911, "grad_norm": 1.446488271803958, "learning_rate": 6.180410146327336e-07, "logits/chosen": -3.3112292289733887, "logits/rejected": -3.009565830230713, "logps/chosen": -11.268660545349121, "logps/rejected": -48.60431671142578, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 2.6027472019195557, "rewards/margins": 2.6027472019195557, "rewards/rejected": 0.0, "step": 1380 }, { "epoch": 7.709497206703911, "eval_logits/chosen": -3.383289337158203, "eval_logits/rejected": -3.508803129196167, "eval_logps/chosen": -24.955305099487305, "eval_logps/rejected": -45.27484893798828, "eval_loss": 0.6841699481010437, "eval_rewards/accuracies": 0.7749999761581421, "eval_rewards/chosen": 0.5114735960960388, "eval_rewards/margins": 0.5114735960960388, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6665, "eval_samples_per_second": 9.49, "eval_steps_per_second": 0.306, "step": 1380 }, { "epoch": 7.715083798882682, "grad_norm": 2.171345951728274, "learning_rate": 6.174486124230817e-07, "logits/chosen": -3.494107961654663, "logits/rejected": -3.584721803665161, "logps/chosen": -1.2909730672836304, "logps/rejected": -39.61445236206055, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 1.7321256399154663, "rewards/margins": 1.7321256399154663, "rewards/rejected": 0.0, "step": 1381 }, { "epoch": 7.720670391061453, "grad_norm": 2.1896169795722726, "learning_rate": 6.168560356638751e-07, "logits/chosen": -3.6455185413360596, "logits/rejected": -3.780693531036377, "logps/chosen": -8.931732177734375, "logps/rejected": -32.342994689941406, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 3.001253128051758, "rewards/margins": 3.001253128051758, "rewards/rejected": 0.0, "step": 1382 }, { "epoch": 7.726256983240224, "grad_norm": 1.1964830984574129, "learning_rate": 6.162632852357887e-07, "logits/chosen": -3.446711540222168, "logits/rejected": -3.3967432975769043, "logps/chosen": -8.087589263916016, "logps/rejected": -30.408592224121094, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.4150784015655518, "rewards/margins": 2.4150784015655518, "rewards/rejected": 0.0, "step": 1383 }, { "epoch": 7.731843575418995, "grad_norm": 1.4030991044792858, "learning_rate": 6.156703620197552e-07, "logits/chosen": -3.4570517539978027, "logits/rejected": -3.558060884475708, "logps/chosen": -4.256320953369141, "logps/rejected": -38.700252532958984, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 2.5645179748535156, "rewards/margins": 2.5645179748535156, "rewards/rejected": 0.0, "step": 1384 }, { "epoch": 7.737430167597766, "grad_norm": 2.858857863046088, "learning_rate": 6.150772668969639e-07, "logits/chosen": -3.7470078468322754, "logits/rejected": -3.738076686859131, "logps/chosen": -14.466458320617676, "logps/rejected": -39.010990142822266, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 3.303770065307617, "rewards/margins": 3.303770065307617, "rewards/rejected": 0.0, "step": 1385 }, { "epoch": 7.743016759776537, "grad_norm": 1.8436366580988155, "learning_rate": 6.144840007488598e-07, "logits/chosen": -3.9643008708953857, "logits/rejected": -3.5621531009674072, "logps/chosen": -3.323282480239868, "logps/rejected": -33.563270568847656, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 2.2505550384521484, "rewards/margins": 2.2505550384521484, "rewards/rejected": 0.0, "step": 1386 }, { "epoch": 7.748603351955307, "grad_norm": 33.18464451137197, "learning_rate": 6.13890564457142e-07, "logits/chosen": -3.5103845596313477, "logits/rejected": -3.5708584785461426, "logps/chosen": -5.393838882446289, "logps/rejected": -81.33026123046875, "loss": 0.1234, "rewards/accuracies": 1.0, "rewards/chosen": 2.862974166870117, "rewards/margins": 2.862974166870117, "rewards/rejected": 0.0, "step": 1387 }, { "epoch": 7.754189944134078, "grad_norm": 2.6710716417733202, "learning_rate": 6.132969589037629e-07, "logits/chosen": -3.633074998855591, "logits/rejected": -3.5671002864837646, "logps/chosen": -2.210402727127075, "logps/rejected": -37.03133010864258, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 1.8473315238952637, "rewards/margins": 1.8473315238952637, "rewards/rejected": 0.0, "step": 1388 }, { "epoch": 7.759776536312849, "grad_norm": 1.9040518769852222, "learning_rate": 6.127031849709256e-07, "logits/chosen": -3.615497350692749, "logits/rejected": -3.615487813949585, "logps/chosen": -8.555794715881348, "logps/rejected": -52.548377990722656, "loss": 0.1161, "rewards/accuracies": 1.0, "rewards/chosen": 2.9727935791015625, "rewards/margins": 2.9727935791015625, "rewards/rejected": 0.0, "step": 1389 }, { "epoch": 7.76536312849162, "grad_norm": 3.1192216970559175, "learning_rate": 6.12109243541084e-07, "logits/chosen": -3.539001703262329, "logits/rejected": -3.648571491241455, "logps/chosen": -1.8654552698135376, "logps/rejected": -49.581146240234375, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 2.5140178203582764, "rewards/margins": 2.5140178203582764, "rewards/rejected": 0.0, "step": 1390 }, { "epoch": 7.770949720670391, "grad_norm": 1.357172241819613, "learning_rate": 6.115151354969413e-07, "logits/chosen": -3.668170690536499, "logits/rejected": -3.449744939804077, "logps/chosen": -17.671070098876953, "logps/rejected": -23.16139793395996, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 2.8469767570495605, "rewards/margins": 2.8469767570495605, "rewards/rejected": 0.0, "step": 1391 }, { "epoch": 7.776536312849162, "grad_norm": 1.929706858236694, "learning_rate": 6.10920861721447e-07, "logits/chosen": -3.673409938812256, "logits/rejected": -3.611703872680664, "logps/chosen": -1.868884563446045, "logps/rejected": -102.60816955566406, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 2.1316471099853516, "rewards/margins": 2.1316471099853516, "rewards/rejected": 0.0, "step": 1392 }, { "epoch": 7.782122905027933, "grad_norm": 4.120154268907915, "learning_rate": 6.103264230977986e-07, "logits/chosen": -3.464028835296631, "logits/rejected": -3.5241379737854004, "logps/chosen": -8.902050018310547, "logps/rejected": -28.881071090698242, "loss": 0.1233, "rewards/accuracies": 1.0, "rewards/chosen": 2.9710700511932373, "rewards/margins": 2.9710700511932373, "rewards/rejected": 0.0, "step": 1393 }, { "epoch": 7.787709497206704, "grad_norm": 2.72460910417234, "learning_rate": 6.097318205094373e-07, "logits/chosen": -3.392451286315918, "logits/rejected": -3.666982412338257, "logps/chosen": -13.357746124267578, "logps/rejected": -72.52420806884766, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 2.2183361053466797, "rewards/margins": 2.2183361053466797, "rewards/rejected": 0.0, "step": 1394 }, { "epoch": 7.793296089385475, "grad_norm": 1.9192607441144842, "learning_rate": 6.091370548400485e-07, "logits/chosen": -3.409224033355713, "logits/rejected": -3.3265554904937744, "logps/chosen": -1.9528727531433105, "logps/rejected": -83.61040496826172, "loss": 0.1048, "rewards/accuracies": 1.0, "rewards/chosen": 2.0228054523468018, "rewards/margins": 2.0228054523468018, "rewards/rejected": 0.0, "step": 1395 }, { "epoch": 7.798882681564246, "grad_norm": 1.867699872620725, "learning_rate": 6.0854212697356e-07, "logits/chosen": -3.6545894145965576, "logits/rejected": -3.3519484996795654, "logps/chosen": -7.546158790588379, "logps/rejected": -30.187633514404297, "loss": 0.116, "rewards/accuracies": 1.0, "rewards/chosen": 2.773406982421875, "rewards/margins": 2.773406982421875, "rewards/rejected": 0.0, "step": 1396 }, { "epoch": 7.804469273743017, "grad_norm": 2.2953090984405846, "learning_rate": 6.079470377941406e-07, "logits/chosen": -3.3122012615203857, "logits/rejected": -3.3559436798095703, "logps/chosen": -3.9074928760528564, "logps/rejected": -51.002899169921875, "loss": 0.1215, "rewards/accuracies": 1.0, "rewards/chosen": 2.5583930015563965, "rewards/margins": 2.5583930015563965, "rewards/rejected": 0.0, "step": 1397 }, { "epoch": 7.810055865921788, "grad_norm": 1.544662917361502, "learning_rate": 6.073517881861987e-07, "logits/chosen": -3.674224615097046, "logits/rejected": -3.7264983654022217, "logps/chosen": -4.493802547454834, "logps/rejected": -24.393909454345703, "loss": 0.0847, "rewards/accuracies": 1.0, "rewards/chosen": 2.688265562057495, "rewards/margins": 2.688265562057495, "rewards/rejected": 0.0, "step": 1398 }, { "epoch": 7.815642458100559, "grad_norm": 1.439069885695365, "learning_rate": 6.067563790343813e-07, "logits/chosen": -3.4475038051605225, "logits/rejected": -3.591827869415283, "logps/chosen": -5.838848114013672, "logps/rejected": -49.471961975097656, "loss": 0.1321, "rewards/accuracies": 1.0, "rewards/chosen": 2.5757193565368652, "rewards/margins": 2.5757193565368652, "rewards/rejected": 0.0, "step": 1399 }, { "epoch": 7.82122905027933, "grad_norm": 1.1246715943382999, "learning_rate": 6.061608112235723e-07, "logits/chosen": -3.374066114425659, "logits/rejected": -3.497426986694336, "logps/chosen": -1.7087607383728027, "logps/rejected": -63.19783020019531, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 1.7700719833374023, "rewards/margins": 1.7700719833374023, "rewards/rejected": 0.0, "step": 1400 }, { "epoch": 7.82122905027933, "eval_logits/chosen": -3.366156816482544, "eval_logits/rejected": -3.4939181804656982, "eval_logps/chosen": -25.586759567260742, "eval_logps/rejected": -46.298309326171875, "eval_loss": 0.7026585936546326, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.4483281075954437, "eval_rewards/margins": 0.4483281075954437, "eval_rewards/rejected": 0.0, "eval_runtime": 32.698, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 1400 }, { "epoch": 7.826815642458101, "grad_norm": 2.260126420189014, "learning_rate": 6.055650856388916e-07, "logits/chosen": -3.5005438327789307, "logits/rejected": -3.439100742340088, "logps/chosen": -1.6743199825286865, "logps/rejected": -31.64121437072754, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.0029587745666504, "rewards/margins": 2.0029587745666504, "rewards/rejected": 0.0, "step": 1401 }, { "epoch": 7.832402234636872, "grad_norm": 3.736988949502141, "learning_rate": 6.049692031656934e-07, "logits/chosen": -3.4197769165039062, "logits/rejected": -3.3480851650238037, "logps/chosen": -6.497530460357666, "logps/rejected": -40.690635681152344, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 2.5752921104431152, "rewards/margins": 2.5752921104431152, "rewards/rejected": 0.0, "step": 1402 }, { "epoch": 7.837988826815643, "grad_norm": 4.0719521366359785, "learning_rate": 6.043731646895655e-07, "logits/chosen": -3.29199481010437, "logits/rejected": -3.2787442207336426, "logps/chosen": -4.126980304718018, "logps/rejected": -25.43942642211914, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": 2.086606025695801, "rewards/margins": 2.086606025695801, "rewards/rejected": 0.0, "step": 1403 }, { "epoch": 7.843575418994414, "grad_norm": 2.2633841595929978, "learning_rate": 6.037769710963271e-07, "logits/chosen": -3.43090558052063, "logits/rejected": -3.5747225284576416, "logps/chosen": -4.280132293701172, "logps/rejected": -33.68571472167969, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 2.464203119277954, "rewards/margins": 2.464203119277954, "rewards/rejected": 0.0, "step": 1404 }, { "epoch": 7.849162011173185, "grad_norm": 1.9929313553569437, "learning_rate": 6.031806232720277e-07, "logits/chosen": -3.09665846824646, "logits/rejected": -3.1195924282073975, "logps/chosen": -32.372947692871094, "logps/rejected": -27.54912567138672, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": 3.689082622528076, "rewards/margins": 3.689082622528076, "rewards/rejected": 0.0, "step": 1405 }, { "epoch": 7.854748603351956, "grad_norm": 1.611789802544927, "learning_rate": 6.025841221029469e-07, "logits/chosen": -3.410153865814209, "logits/rejected": -3.562817335128784, "logps/chosen": -3.033844470977783, "logps/rejected": -54.31105041503906, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 2.2825844287872314, "rewards/margins": 2.2825844287872314, "rewards/rejected": 0.0, "step": 1406 }, { "epoch": 7.860335195530726, "grad_norm": 6.442979724468341, "learning_rate": 6.019874684755913e-07, "logits/chosen": -3.4948666095733643, "logits/rejected": -3.6496846675872803, "logps/chosen": -4.0476837158203125, "logps/rejected": -115.38198852539062, "loss": 0.1469, "rewards/accuracies": 1.0, "rewards/chosen": 1.433524489402771, "rewards/margins": 1.433524489402771, "rewards/rejected": 0.0, "step": 1407 }, { "epoch": 7.865921787709497, "grad_norm": 2.789367083541046, "learning_rate": 6.013906632766946e-07, "logits/chosen": -3.7730135917663574, "logits/rejected": -3.7376444339752197, "logps/chosen": -4.7420172691345215, "logps/rejected": -45.96259307861328, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 3.087348461151123, "rewards/margins": 3.087348461151123, "rewards/rejected": 0.0, "step": 1408 }, { "epoch": 7.871508379888268, "grad_norm": 1.6642517172457674, "learning_rate": 6.007937073932156e-07, "logits/chosen": -3.703899621963501, "logits/rejected": -3.7811319828033447, "logps/chosen": -2.2926485538482666, "logps/rejected": -42.525848388671875, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 2.5533089637756348, "rewards/margins": 2.5533089637756348, "rewards/rejected": 0.0, "step": 1409 }, { "epoch": 7.877094972067039, "grad_norm": 2.5132314691328, "learning_rate": 6.001966017123371e-07, "logits/chosen": -3.6104767322540283, "logits/rejected": -3.619384765625, "logps/chosen": -1.8226158618927002, "logps/rejected": -48.86735534667969, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 1.5055662393569946, "rewards/margins": 1.5055662393569946, "rewards/rejected": 0.0, "step": 1410 }, { "epoch": 7.88268156424581, "grad_norm": 3.5448643787501206, "learning_rate": 5.995993471214644e-07, "logits/chosen": -3.394293785095215, "logits/rejected": -3.4889090061187744, "logps/chosen": -21.46180534362793, "logps/rejected": -35.470645904541016, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": 3.3129191398620605, "rewards/margins": 3.3129191398620605, "rewards/rejected": 0.0, "step": 1411 }, { "epoch": 7.888268156424581, "grad_norm": 2.71436756552808, "learning_rate": 5.990019445082241e-07, "logits/chosen": -3.2157909870147705, "logits/rejected": -3.5748908519744873, "logps/chosen": -8.530254364013672, "logps/rejected": -38.43312072753906, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 2.5410425662994385, "rewards/margins": 2.5410425662994385, "rewards/rejected": 0.0, "step": 1412 }, { "epoch": 7.893854748603352, "grad_norm": 3.4204042283735383, "learning_rate": 5.98404394760463e-07, "logits/chosen": -3.390915870666504, "logits/rejected": -3.157789945602417, "logps/chosen": -10.765122413635254, "logps/rejected": -43.94075393676758, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 2.5835838317871094, "rewards/margins": 2.5835838317871094, "rewards/rejected": 0.0, "step": 1413 }, { "epoch": 7.899441340782123, "grad_norm": 2.5722225871384077, "learning_rate": 5.978066987662464e-07, "logits/chosen": -3.525294542312622, "logits/rejected": -3.432190418243408, "logps/chosen": -8.895794868469238, "logps/rejected": -32.79235076904297, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.473673105239868, "rewards/margins": 2.473673105239868, "rewards/rejected": 0.0, "step": 1414 }, { "epoch": 7.905027932960894, "grad_norm": 2.356667195650094, "learning_rate": 5.97208857413857e-07, "logits/chosen": -3.508132219314575, "logits/rejected": -3.1443114280700684, "logps/chosen": -3.209148645401001, "logps/rejected": -38.9202995300293, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 2.4470789432525635, "rewards/margins": 2.4470789432525635, "rewards/rejected": 0.0, "step": 1415 }, { "epoch": 7.910614525139665, "grad_norm": 2.4473535198646243, "learning_rate": 5.966108715917937e-07, "logits/chosen": -3.510397434234619, "logits/rejected": -3.4470112323760986, "logps/chosen": -7.509511947631836, "logps/rejected": -74.8454360961914, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.598639965057373, "rewards/margins": 2.598639965057373, "rewards/rejected": 0.0, "step": 1416 }, { "epoch": 7.916201117318436, "grad_norm": 2.4295257304259916, "learning_rate": 5.960127421887696e-07, "logits/chosen": -3.386652946472168, "logits/rejected": -3.4341964721679688, "logps/chosen": -11.838761329650879, "logps/rejected": -29.63746452331543, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 2.595767021179199, "rewards/margins": 2.595767021179199, "rewards/rejected": 0.0, "step": 1417 }, { "epoch": 7.921787709497207, "grad_norm": 2.2345989719667037, "learning_rate": 5.954144700937118e-07, "logits/chosen": -3.4850287437438965, "logits/rejected": -3.368241548538208, "logps/chosen": -14.28183364868164, "logps/rejected": -23.823863983154297, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 2.896121025085449, "rewards/margins": 2.896121025085449, "rewards/rejected": 0.0, "step": 1418 }, { "epoch": 7.927374301675978, "grad_norm": 3.0241236298958203, "learning_rate": 5.948160561957591e-07, "logits/chosen": -3.1309311389923096, "logits/rejected": -3.131141185760498, "logps/chosen": -3.4051127433776855, "logps/rejected": -28.47091293334961, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 2.3636093139648438, "rewards/margins": 2.3636093139648438, "rewards/rejected": 0.0, "step": 1419 }, { "epoch": 7.932960893854749, "grad_norm": 2.158676825979555, "learning_rate": 5.942175013842609e-07, "logits/chosen": -3.1837263107299805, "logits/rejected": -3.4182281494140625, "logps/chosen": -20.090307235717773, "logps/rejected": -59.078121185302734, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 2.475255012512207, "rewards/margins": 2.475255012512207, "rewards/rejected": 0.0, "step": 1420 }, { "epoch": 7.932960893854749, "eval_logits/chosen": -3.361480236053467, "eval_logits/rejected": -3.488327741622925, "eval_logps/chosen": -25.533275604248047, "eval_logps/rejected": -45.56114196777344, "eval_loss": 0.6961826086044312, "eval_rewards/accuracies": 0.7749999761581421, "eval_rewards/chosen": 0.4536762833595276, "eval_rewards/margins": 0.4536762833595276, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7276, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 1420 }, { "epoch": 7.93854748603352, "grad_norm": 3.6104957537270526, "learning_rate": 5.936188065487764e-07, "logits/chosen": -3.586928129196167, "logits/rejected": -3.7177696228027344, "logps/chosen": -1.4119430780410767, "logps/rejected": -24.606647491455078, "loss": 0.1305, "rewards/accuracies": 1.0, "rewards/chosen": 1.733350396156311, "rewards/margins": 1.733350396156311, "rewards/rejected": 0.0, "step": 1421 }, { "epoch": 7.94413407821229, "grad_norm": 1.5794556899989902, "learning_rate": 5.930199725790729e-07, "logits/chosen": -3.2176473140716553, "logits/rejected": -3.1848368644714355, "logps/chosen": -21.64169692993164, "logps/rejected": -41.134521484375, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 3.4605870246887207, "rewards/margins": 3.4605870246887207, "rewards/rejected": 0.0, "step": 1422 }, { "epoch": 7.949720670391061, "grad_norm": 1.8753331860776365, "learning_rate": 5.924210003651241e-07, "logits/chosen": -3.482959032058716, "logits/rejected": -3.593095541000366, "logps/chosen": -4.3200578689575195, "logps/rejected": -32.17010498046875, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.347722053527832, "rewards/margins": 2.347722053527832, "rewards/rejected": 0.0, "step": 1423 }, { "epoch": 7.955307262569832, "grad_norm": 2.255555162649948, "learning_rate": 5.918218907971094e-07, "logits/chosen": -3.6305172443389893, "logits/rejected": -3.607184886932373, "logps/chosen": -1.2554359436035156, "logps/rejected": -29.569927215576172, "loss": 0.1317, "rewards/accuracies": 1.0, "rewards/chosen": 2.031468152999878, "rewards/margins": 2.031468152999878, "rewards/rejected": 0.0, "step": 1424 }, { "epoch": 7.960893854748603, "grad_norm": 1.597704858025818, "learning_rate": 5.912226447654126e-07, "logits/chosen": -3.569333553314209, "logits/rejected": -3.6216049194335938, "logps/chosen": -7.23366641998291, "logps/rejected": -30.640758514404297, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.3793201446533203, "rewards/margins": 3.3793201446533203, "rewards/rejected": 0.0, "step": 1425 }, { "epoch": 7.966480446927374, "grad_norm": 1.8122627061811327, "learning_rate": 5.906232631606197e-07, "logits/chosen": -3.5693445205688477, "logits/rejected": -3.5513341426849365, "logps/chosen": -10.951475143432617, "logps/rejected": -31.499202728271484, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 2.9858345985412598, "rewards/margins": 2.9858345985412598, "rewards/rejected": 0.0, "step": 1426 }, { "epoch": 7.972067039106145, "grad_norm": 2.6827762380234916, "learning_rate": 5.900237468735187e-07, "logits/chosen": -3.6298258304595947, "logits/rejected": -3.8624138832092285, "logps/chosen": -27.023897171020508, "logps/rejected": -43.194000244140625, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 2.627375841140747, "rewards/margins": 2.627375841140747, "rewards/rejected": 0.0, "step": 1427 }, { "epoch": 7.977653631284916, "grad_norm": 5.03487213920907, "learning_rate": 5.894240967950975e-07, "logits/chosen": -3.6355860233306885, "logits/rejected": -3.4657349586486816, "logps/chosen": -4.8664655685424805, "logps/rejected": -72.21208190917969, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": 2.4866435527801514, "rewards/margins": 2.4866435527801514, "rewards/rejected": 0.0, "step": 1428 }, { "epoch": 7.983240223463687, "grad_norm": 6.345048662850746, "learning_rate": 5.888243138165428e-07, "logits/chosen": -3.5810298919677734, "logits/rejected": -3.5718092918395996, "logps/chosen": -8.52374267578125, "logps/rejected": -21.86713409423828, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 2.503370761871338, "rewards/margins": 2.503370761871338, "rewards/rejected": 0.0, "step": 1429 }, { "epoch": 7.988826815642458, "grad_norm": 2.591814369716358, "learning_rate": 5.882243988292392e-07, "logits/chosen": -3.4187800884246826, "logits/rejected": -3.482560396194458, "logps/chosen": -2.0377821922302246, "logps/rejected": -29.07729148864746, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 2.1618528366088867, "rewards/margins": 2.1618528366088867, "rewards/rejected": 0.0, "step": 1430 }, { "epoch": 7.994413407821229, "grad_norm": 1.2980300131545568, "learning_rate": 5.876243527247668e-07, "logits/chosen": -3.6745760440826416, "logits/rejected": -3.680694341659546, "logps/chosen": -10.520817756652832, "logps/rejected": -22.926095962524414, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 2.663787603378296, "rewards/margins": 2.663787603378296, "rewards/rejected": 0.0, "step": 1431 }, { "epoch": 8.0, "grad_norm": 1.6699100499013064, "learning_rate": 5.870241763949014e-07, "logits/chosen": -3.602869987487793, "logits/rejected": -3.5776760578155518, "logps/chosen": -2.982872486114502, "logps/rejected": -26.168415069580078, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 2.163339614868164, "rewards/margins": 2.163339614868164, "rewards/rejected": 0.0, "step": 1432 }, { "epoch": 8.005586592178771, "grad_norm": 0.9050705109730053, "learning_rate": 5.864238707316116e-07, "logits/chosen": -3.2913479804992676, "logits/rejected": -3.4438178539276123, "logps/chosen": -26.037870407104492, "logps/rejected": -39.11576461791992, "loss": 0.0951, "rewards/accuracies": 1.0, "rewards/chosen": 3.1983556747436523, "rewards/margins": 3.1983556747436523, "rewards/rejected": 0.0, "step": 1433 }, { "epoch": 8.011173184357542, "grad_norm": 1.127936173615925, "learning_rate": 5.858234366270586e-07, "logits/chosen": -3.483159065246582, "logits/rejected": -3.3761470317840576, "logps/chosen": -1.068917989730835, "logps/rejected": -38.09598922729492, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 1.7994022369384766, "rewards/margins": 1.7994022369384766, "rewards/rejected": 0.0, "step": 1434 }, { "epoch": 8.016759776536313, "grad_norm": 1.8724140846559836, "learning_rate": 5.852228749735946e-07, "logits/chosen": -3.496731758117676, "logits/rejected": -3.6163177490234375, "logps/chosen": -1.5850613117218018, "logps/rejected": -40.112396240234375, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 1.8730049133300781, "rewards/margins": 1.8730049133300781, "rewards/rejected": 0.0, "step": 1435 }, { "epoch": 8.022346368715084, "grad_norm": 1.5935093998237166, "learning_rate": 5.84622186663761e-07, "logits/chosen": -3.453112840652466, "logits/rejected": -3.3465051651000977, "logps/chosen": -2.498274803161621, "logps/rejected": -21.617000579833984, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 2.6029903888702393, "rewards/margins": 2.6029903888702393, "rewards/rejected": 0.0, "step": 1436 }, { "epoch": 8.027932960893855, "grad_norm": 2.15951044683322, "learning_rate": 5.840213725902876e-07, "logits/chosen": -3.4027249813079834, "logits/rejected": -3.431945562362671, "logps/chosen": -5.285755157470703, "logps/rejected": -34.72210693359375, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 2.708683490753174, "rewards/margins": 2.708683490753174, "rewards/rejected": 0.0, "step": 1437 }, { "epoch": 8.033519553072626, "grad_norm": 0.8011175434960901, "learning_rate": 5.834204336460911e-07, "logits/chosen": -3.5468287467956543, "logits/rejected": -3.7178070545196533, "logps/chosen": -5.940998077392578, "logps/rejected": -49.089820861816406, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 3.1336450576782227, "rewards/margins": 3.1336450576782227, "rewards/rejected": 0.0, "step": 1438 }, { "epoch": 8.039106145251397, "grad_norm": 1.3280364220804615, "learning_rate": 5.828193707242739e-07, "logits/chosen": -3.666799306869507, "logits/rejected": -3.7120518684387207, "logps/chosen": -21.294273376464844, "logps/rejected": -39.77464294433594, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 4.398981094360352, "rewards/margins": 4.398981094360352, "rewards/rejected": 0.0, "step": 1439 }, { "epoch": 8.044692737430168, "grad_norm": 3.402853983729217, "learning_rate": 5.822181847181225e-07, "logits/chosen": -3.434978485107422, "logits/rejected": -3.446974515914917, "logps/chosen": -0.9313244819641113, "logps/rejected": -58.56283950805664, "loss": 0.1522, "rewards/accuracies": 1.0, "rewards/chosen": 1.3463492393493652, "rewards/margins": 1.3463492393493652, "rewards/rejected": 0.0, "step": 1440 }, { "epoch": 8.044692737430168, "eval_logits/chosen": -3.360203504562378, "eval_logits/rejected": -3.488086700439453, "eval_logps/chosen": -25.463388442993164, "eval_logps/rejected": -45.98029327392578, "eval_loss": 0.7024779915809631, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.46066561341285706, "eval_rewards/margins": 0.46066561341285706, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7122, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 1440 }, { "epoch": 8.050279329608939, "grad_norm": 1.0956868530919193, "learning_rate": 5.816168765211062e-07, "logits/chosen": -3.486517906188965, "logits/rejected": -3.427119255065918, "logps/chosen": -9.0908203125, "logps/rejected": -30.563962936401367, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 3.1291401386260986, "rewards/margins": 3.1291401386260986, "rewards/rejected": 0.0, "step": 1441 }, { "epoch": 8.05586592178771, "grad_norm": 1.4963688156216446, "learning_rate": 5.810154470268763e-07, "logits/chosen": -3.4776787757873535, "logits/rejected": -3.3631644248962402, "logps/chosen": -6.872175693511963, "logps/rejected": -41.6898193359375, "loss": 0.1404, "rewards/accuracies": 1.0, "rewards/chosen": 3.4865894317626953, "rewards/margins": 3.4865894317626953, "rewards/rejected": 0.0, "step": 1442 }, { "epoch": 8.061452513966481, "grad_norm": 2.350593152421529, "learning_rate": 5.804138971292641e-07, "logits/chosen": -3.5967485904693604, "logits/rejected": -3.61181902885437, "logps/chosen": -8.008771896362305, "logps/rejected": -44.63813781738281, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 3.2435052394866943, "rewards/margins": 3.2435052394866943, "rewards/rejected": 0.0, "step": 1443 }, { "epoch": 8.067039106145252, "grad_norm": 0.8201971929711597, "learning_rate": 5.798122277222797e-07, "logits/chosen": -3.495875597000122, "logits/rejected": -3.578230142593384, "logps/chosen": -4.396419525146484, "logps/rejected": -61.46167755126953, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 2.4080634117126465, "rewards/margins": 2.4080634117126465, "rewards/rejected": 0.0, "step": 1444 }, { "epoch": 8.072625698324023, "grad_norm": 2.6149905749508098, "learning_rate": 5.792104397001113e-07, "logits/chosen": -3.2644505500793457, "logits/rejected": -3.475562334060669, "logps/chosen": -1.256878137588501, "logps/rejected": -65.22468566894531, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": 1.986826777458191, "rewards/margins": 1.986826777458191, "rewards/rejected": 0.0, "step": 1445 }, { "epoch": 8.078212290502794, "grad_norm": 1.7701652989609762, "learning_rate": 5.786085339571229e-07, "logits/chosen": -3.4310989379882812, "logits/rejected": -3.526287794113159, "logps/chosen": -1.9093594551086426, "logps/rejected": -46.967960357666016, "loss": 0.1098, "rewards/accuracies": 1.0, "rewards/chosen": 2.1475911140441895, "rewards/margins": 2.1475911140441895, "rewards/rejected": 0.0, "step": 1446 }, { "epoch": 8.083798882681565, "grad_norm": 2.0776576727262817, "learning_rate": 5.780065113878537e-07, "logits/chosen": -3.533348321914673, "logits/rejected": -3.5505824089050293, "logps/chosen": -3.810591220855713, "logps/rejected": -56.04825973510742, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": 2.585858106613159, "rewards/margins": 2.585858106613159, "rewards/rejected": 0.0, "step": 1447 }, { "epoch": 8.089385474860336, "grad_norm": 0.9398781507882047, "learning_rate": 5.774043728870161e-07, "logits/chosen": -3.622270345687866, "logits/rejected": -3.5502419471740723, "logps/chosen": -8.514244079589844, "logps/rejected": -24.470233917236328, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": 2.955456256866455, "rewards/margins": 2.955456256866455, "rewards/rejected": 0.0, "step": 1448 }, { "epoch": 8.094972067039107, "grad_norm": 1.0188300956345018, "learning_rate": 5.768021193494956e-07, "logits/chosen": -3.660067081451416, "logits/rejected": -3.6409153938293457, "logps/chosen": -0.8675261735916138, "logps/rejected": -57.86646270751953, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 2.0977039337158203, "rewards/margins": 2.0977039337158203, "rewards/rejected": 0.0, "step": 1449 }, { "epoch": 8.100558659217878, "grad_norm": 3.1775383354857127, "learning_rate": 5.761997516703478e-07, "logits/chosen": -3.269120931625366, "logits/rejected": -3.2979540824890137, "logps/chosen": -1.818122386932373, "logps/rejected": -73.30386352539062, "loss": 0.1367, "rewards/accuracies": 1.0, "rewards/chosen": 2.148003339767456, "rewards/margins": 2.148003339767456, "rewards/rejected": 0.0, "step": 1450 }, { "epoch": 8.106145251396647, "grad_norm": 1.1074461025220506, "learning_rate": 5.755972707447989e-07, "logits/chosen": -3.720050096511841, "logits/rejected": -3.878394365310669, "logps/chosen": -1.9917762279510498, "logps/rejected": -46.12985610961914, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.4722256660461426, "rewards/margins": 2.4722256660461426, "rewards/rejected": 0.0, "step": 1451 }, { "epoch": 8.111731843575418, "grad_norm": 0.8836695493814023, "learning_rate": 5.749946774682425e-07, "logits/chosen": -3.4956867694854736, "logits/rejected": -3.3236804008483887, "logps/chosen": -18.465164184570312, "logps/rejected": -59.880489349365234, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 3.942150115966797, "rewards/margins": 3.942150115966797, "rewards/rejected": 0.0, "step": 1452 }, { "epoch": 8.11731843575419, "grad_norm": 0.9583283573562732, "learning_rate": 5.743919727362394e-07, "logits/chosen": -3.217168092727661, "logits/rejected": -3.538447856903076, "logps/chosen": -1.6311407089233398, "logps/rejected": -62.83324432373047, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.1093578338623047, "rewards/margins": 2.1093578338623047, "rewards/rejected": 0.0, "step": 1453 }, { "epoch": 8.12290502793296, "grad_norm": 0.8329803477244008, "learning_rate": 5.737891574445164e-07, "logits/chosen": -3.36934232711792, "logits/rejected": -3.589510679244995, "logps/chosen": -11.139848709106445, "logps/rejected": -36.51717758178711, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 2.744891405105591, "rewards/margins": 2.744891405105591, "rewards/rejected": 0.0, "step": 1454 }, { "epoch": 8.128491620111731, "grad_norm": 5.529502196973929, "learning_rate": 5.731862324889644e-07, "logits/chosen": -3.3760480880737305, "logits/rejected": -3.513336420059204, "logps/chosen": -2.8150508403778076, "logps/rejected": -29.952720642089844, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 2.7353410720825195, "rewards/margins": 2.7353410720825195, "rewards/rejected": 0.0, "step": 1455 }, { "epoch": 8.134078212290502, "grad_norm": 2.6192638848792242, "learning_rate": 5.72583198765637e-07, "logits/chosen": -3.425025463104248, "logits/rejected": -3.4201722145080566, "logps/chosen": -0.9650635123252869, "logps/rejected": -38.63450241088867, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 1.8116285800933838, "rewards/margins": 1.8116285800933838, "rewards/rejected": 0.0, "step": 1456 }, { "epoch": 8.139664804469273, "grad_norm": 1.0286521930047465, "learning_rate": 5.7198005717075e-07, "logits/chosen": -3.4640731811523438, "logits/rejected": -3.6357429027557373, "logps/chosen": -1.1994825601577759, "logps/rejected": -65.90774536132812, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 2.2714264392852783, "rewards/margins": 2.2714264392852783, "rewards/rejected": 0.0, "step": 1457 }, { "epoch": 8.145251396648044, "grad_norm": 0.9117180594633306, "learning_rate": 5.71376808600679e-07, "logits/chosen": -3.1062095165252686, "logits/rejected": -3.5535805225372314, "logps/chosen": -4.527098178863525, "logps/rejected": -61.18075942993164, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 2.646533727645874, "rewards/margins": 2.646533727645874, "rewards/rejected": 0.0, "step": 1458 }, { "epoch": 8.150837988826815, "grad_norm": 1.3223691265385114, "learning_rate": 5.707734539519591e-07, "logits/chosen": -3.413194417953491, "logits/rejected": -3.099423885345459, "logps/chosen": -13.974654197692871, "logps/rejected": -33.39994812011719, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 4.1584930419921875, "rewards/margins": 4.1584930419921875, "rewards/rejected": 0.0, "step": 1459 }, { "epoch": 8.156424581005586, "grad_norm": 2.409433570849459, "learning_rate": 5.701699941212823e-07, "logits/chosen": -3.4567711353302, "logits/rejected": -3.5697836875915527, "logps/chosen": -1.32706880569458, "logps/rejected": -30.027732849121094, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 2.199556827545166, "rewards/margins": 2.199556827545166, "rewards/rejected": 0.0, "step": 1460 }, { "epoch": 8.156424581005586, "eval_logits/chosen": -3.3588969707489014, "eval_logits/rejected": -3.4880967140197754, "eval_logps/chosen": -26.15518569946289, "eval_logps/rejected": -48.0563850402832, "eval_loss": 0.7525140047073364, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.3914857506752014, "eval_rewards/margins": 0.3914857506752014, "eval_rewards/rejected": 0.0, "eval_runtime": 32.703, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.306, "step": 1460 }, { "epoch": 8.162011173184357, "grad_norm": 0.8386509025147895, "learning_rate": 5.695664300054978e-07, "logits/chosen": -3.4141733646392822, "logits/rejected": -3.585333824157715, "logps/chosen": -12.4519681930542, "logps/rejected": -68.42787170410156, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 3.225080966949463, "rewards/margins": 3.225080966949463, "rewards/rejected": 0.0, "step": 1461 }, { "epoch": 8.167597765363128, "grad_norm": 3.1654528953590955, "learning_rate": 5.689627625016091e-07, "logits/chosen": -3.5402932167053223, "logits/rejected": -3.541752338409424, "logps/chosen": -1.338120937347412, "logps/rejected": -75.27066040039062, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 1.8199812173843384, "rewards/margins": 1.8199812173843384, "rewards/rejected": 0.0, "step": 1462 }, { "epoch": 8.1731843575419, "grad_norm": 0.920781738470232, "learning_rate": 5.683589925067738e-07, "logits/chosen": -3.5387191772460938, "logits/rejected": -3.5149905681610107, "logps/chosen": -4.906529903411865, "logps/rejected": -55.59393310546875, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 3.0590710639953613, "rewards/margins": 3.0590710639953613, "rewards/rejected": 0.0, "step": 1463 }, { "epoch": 8.17877094972067, "grad_norm": 1.2810884456437077, "learning_rate": 5.677551209183016e-07, "logits/chosen": -3.6051743030548096, "logits/rejected": -3.4584357738494873, "logps/chosen": -3.035280227661133, "logps/rejected": -31.97887420654297, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 2.869049549102783, "rewards/margins": 2.869049549102783, "rewards/rejected": 0.0, "step": 1464 }, { "epoch": 8.184357541899441, "grad_norm": 0.8879067099029657, "learning_rate": 5.671511486336531e-07, "logits/chosen": -3.5902838706970215, "logits/rejected": -3.6346681118011475, "logps/chosen": -0.8220909833908081, "logps/rejected": -31.008304595947266, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 2.3344693183898926, "rewards/margins": 2.3344693183898926, "rewards/rejected": 0.0, "step": 1465 }, { "epoch": 8.189944134078212, "grad_norm": 1.8151028436159289, "learning_rate": 5.665470765504388e-07, "logits/chosen": -3.507279872894287, "logits/rejected": -3.320805311203003, "logps/chosen": -12.214398384094238, "logps/rejected": -28.21538543701172, "loss": 0.1373, "rewards/accuracies": 1.0, "rewards/chosen": 2.6122233867645264, "rewards/margins": 2.6122233867645264, "rewards/rejected": 0.0, "step": 1466 }, { "epoch": 8.195530726256983, "grad_norm": 3.9022381059470432, "learning_rate": 5.659429055664175e-07, "logits/chosen": -3.4230411052703857, "logits/rejected": -3.436345338821411, "logps/chosen": -9.788567543029785, "logps/rejected": -41.93543243408203, "loss": 0.123, "rewards/accuracies": 1.0, "rewards/chosen": 3.5234522819519043, "rewards/margins": 3.5234522819519043, "rewards/rejected": 0.0, "step": 1467 }, { "epoch": 8.201117318435754, "grad_norm": 2.587203604927202, "learning_rate": 5.653386365794946e-07, "logits/chosen": -3.4539031982421875, "logits/rejected": -3.4906692504882812, "logps/chosen": -4.39523458480835, "logps/rejected": -39.78955078125, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 3.061246871948242, "rewards/margins": 3.061246871948242, "rewards/rejected": 0.0, "step": 1468 }, { "epoch": 8.206703910614525, "grad_norm": 2.4062376878197487, "learning_rate": 5.647342704877216e-07, "logits/chosen": -3.698641538619995, "logits/rejected": -3.6472220420837402, "logps/chosen": -3.5880136489868164, "logps/rejected": -31.792198181152344, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 2.213914155960083, "rewards/margins": 2.213914155960083, "rewards/rejected": 0.0, "step": 1469 }, { "epoch": 8.212290502793296, "grad_norm": 1.4506575969019242, "learning_rate": 5.64129808189294e-07, "logits/chosen": -3.744248151779175, "logits/rejected": -3.6063907146453857, "logps/chosen": -6.610927581787109, "logps/rejected": -19.785215377807617, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": 2.8722078800201416, "rewards/margins": 2.8722078800201416, "rewards/rejected": 0.0, "step": 1470 }, { "epoch": 8.217877094972067, "grad_norm": 0.8290463451813451, "learning_rate": 5.635252505825507e-07, "logits/chosen": -3.140641927719116, "logits/rejected": -3.2642526626586914, "logps/chosen": -1.733540654182434, "logps/rejected": -94.34074401855469, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.3417115211486816, "rewards/margins": 2.3417115211486816, "rewards/rejected": 0.0, "step": 1471 }, { "epoch": 8.223463687150838, "grad_norm": 1.2338899754429675, "learning_rate": 5.629205985659718e-07, "logits/chosen": -3.5912399291992188, "logits/rejected": -3.4617409706115723, "logps/chosen": -16.728078842163086, "logps/rejected": -44.76894760131836, "loss": 0.0951, "rewards/accuracies": 1.0, "rewards/chosen": 3.295330286026001, "rewards/margins": 3.295330286026001, "rewards/rejected": 0.0, "step": 1472 }, { "epoch": 8.22905027932961, "grad_norm": 3.4023832547711845, "learning_rate": 5.623158530381777e-07, "logits/chosen": -3.5220608711242676, "logits/rejected": -3.5308523178100586, "logps/chosen": -2.3623828887939453, "logps/rejected": -31.084972381591797, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 2.6224255561828613, "rewards/margins": 2.6224255561828613, "rewards/rejected": 0.0, "step": 1473 }, { "epoch": 8.23463687150838, "grad_norm": 4.063475271499999, "learning_rate": 5.617110148979283e-07, "logits/chosen": -3.2087531089782715, "logits/rejected": -3.1776955127716064, "logps/chosen": -0.906381368637085, "logps/rejected": -37.97704315185547, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 1.9931944608688354, "rewards/margins": 1.9931944608688354, "rewards/rejected": 0.0, "step": 1474 }, { "epoch": 8.240223463687151, "grad_norm": 2.315438183563028, "learning_rate": 5.611060850441206e-07, "logits/chosen": -3.4199037551879883, "logits/rejected": -3.3875744342803955, "logps/chosen": -7.336108207702637, "logps/rejected": -31.82926368713379, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 3.218153715133667, "rewards/margins": 3.218153715133667, "rewards/rejected": 0.0, "step": 1475 }, { "epoch": 8.245810055865922, "grad_norm": 2.101473370594678, "learning_rate": 5.60501064375788e-07, "logits/chosen": -3.6835854053497314, "logits/rejected": -3.642129898071289, "logps/chosen": -6.935665607452393, "logps/rejected": -57.88630294799805, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 2.5883429050445557, "rewards/margins": 2.5883429050445557, "rewards/rejected": 0.0, "step": 1476 }, { "epoch": 8.251396648044693, "grad_norm": 8.024140845797877, "learning_rate": 5.598959537920994e-07, "logits/chosen": -3.7375729084014893, "logits/rejected": -3.604775905609131, "logps/chosen": -9.299234390258789, "logps/rejected": -40.53462219238281, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 2.622415542602539, "rewards/margins": 2.622415542602539, "rewards/rejected": 0.0, "step": 1477 }, { "epoch": 8.256983240223464, "grad_norm": 1.1968602033493951, "learning_rate": 5.592907541923564e-07, "logits/chosen": -3.542449951171875, "logits/rejected": -3.4163997173309326, "logps/chosen": -0.6911807656288147, "logps/rejected": -55.949310302734375, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 2.0382418632507324, "rewards/margins": 2.0382418632507324, "rewards/rejected": 0.0, "step": 1478 }, { "epoch": 8.262569832402235, "grad_norm": 1.7473793253098002, "learning_rate": 5.586854664759935e-07, "logits/chosen": -3.5408735275268555, "logits/rejected": -3.641183376312256, "logps/chosen": -10.276369094848633, "logps/rejected": -46.6556510925293, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 3.896777629852295, "rewards/margins": 3.896777629852295, "rewards/rejected": 0.0, "step": 1479 }, { "epoch": 8.268156424581006, "grad_norm": 0.8557682911360565, "learning_rate": 5.580800915425762e-07, "logits/chosen": -3.7345876693725586, "logits/rejected": -3.5522172451019287, "logps/chosen": -11.918123245239258, "logps/rejected": -31.203411102294922, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 3.502218723297119, "rewards/margins": 3.502218723297119, "rewards/rejected": 0.0, "step": 1480 }, { "epoch": 8.268156424581006, "eval_logits/chosen": -3.3670451641082764, "eval_logits/rejected": -3.493870258331299, "eval_logps/chosen": -25.99761390686035, "eval_logps/rejected": -47.492557525634766, "eval_loss": 0.7414113283157349, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": 0.4072425961494446, "eval_rewards/margins": 0.4072425961494446, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7446, "eval_samples_per_second": 9.467, "eval_steps_per_second": 0.305, "step": 1480 }, { "epoch": 8.273743016759777, "grad_norm": 1.5173921999512312, "learning_rate": 5.574746302917993e-07, "logits/chosen": -3.706956386566162, "logits/rejected": -3.369967460632324, "logps/chosen": -1.1656317710876465, "logps/rejected": -44.58979797363281, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.2945690155029297, "rewards/margins": 2.2945690155029297, "rewards/rejected": 0.0, "step": 1481 }, { "epoch": 8.279329608938548, "grad_norm": 1.7484709661490887, "learning_rate": 5.56869083623486e-07, "logits/chosen": -3.258481025695801, "logits/rejected": -3.519138813018799, "logps/chosen": -1.1044570207595825, "logps/rejected": -66.29925537109375, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 2.114016056060791, "rewards/margins": 2.114016056060791, "rewards/rejected": 0.0, "step": 1482 }, { "epoch": 8.28491620111732, "grad_norm": 1.1513836623784581, "learning_rate": 5.562634524375869e-07, "logits/chosen": -3.3325798511505127, "logits/rejected": -3.3959743976593018, "logps/chosen": -3.7329177856445312, "logps/rejected": -28.587566375732422, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 2.9908037185668945, "rewards/margins": 2.9908037185668945, "rewards/rejected": 0.0, "step": 1483 }, { "epoch": 8.29050279329609, "grad_norm": 0.6982183167014285, "learning_rate": 5.556577376341773e-07, "logits/chosen": -3.776313304901123, "logits/rejected": -3.878551721572876, "logps/chosen": -5.873186111450195, "logps/rejected": -36.530662536621094, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.72420597076416, "rewards/margins": 2.72420597076416, "rewards/rejected": 0.0, "step": 1484 }, { "epoch": 8.296089385474861, "grad_norm": 1.493891468823741, "learning_rate": 5.550519401134576e-07, "logits/chosen": -3.0950396060943604, "logits/rejected": -3.0590198040008545, "logps/chosen": -18.803293228149414, "logps/rejected": -29.01365852355957, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 2.442892551422119, "rewards/margins": 2.442892551422119, "rewards/rejected": 0.0, "step": 1485 }, { "epoch": 8.30167597765363, "grad_norm": 0.8501588378546953, "learning_rate": 5.544460607757505e-07, "logits/chosen": -3.559340238571167, "logits/rejected": -3.6847593784332275, "logps/chosen": -24.36299705505371, "logps/rejected": -51.866939544677734, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 3.3191587924957275, "rewards/margins": 3.3191587924957275, "rewards/rejected": 0.0, "step": 1486 }, { "epoch": 8.307262569832401, "grad_norm": 1.912154215172588, "learning_rate": 5.53840100521501e-07, "logits/chosen": -3.541095018386841, "logits/rejected": -3.482886552810669, "logps/chosen": -1.7762627601623535, "logps/rejected": -65.16224670410156, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 2.2329888343811035, "rewards/margins": 2.2329888343811035, "rewards/rejected": 0.0, "step": 1487 }, { "epoch": 8.312849162011172, "grad_norm": 0.8990791179958932, "learning_rate": 5.532340602512736e-07, "logits/chosen": -3.5542407035827637, "logits/rejected": -3.6396937370300293, "logps/chosen": -3.978266716003418, "logps/rejected": -63.02491760253906, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 2.5638134479522705, "rewards/margins": 2.5638134479522705, "rewards/rejected": 0.0, "step": 1488 }, { "epoch": 8.318435754189943, "grad_norm": 2.317155548097579, "learning_rate": 5.526279408657522e-07, "logits/chosen": -3.3917346000671387, "logits/rejected": -3.515409231185913, "logps/chosen": -11.306192398071289, "logps/rejected": -67.20194244384766, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 3.041069507598877, "rewards/margins": 3.041069507598877, "rewards/rejected": 0.0, "step": 1489 }, { "epoch": 8.324022346368714, "grad_norm": 1.0533836877127634, "learning_rate": 5.520217432657381e-07, "logits/chosen": -3.6714675426483154, "logits/rejected": -3.4912898540496826, "logps/chosen": -6.029515266418457, "logps/rejected": -71.71287536621094, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 2.8722400665283203, "rewards/margins": 2.8722400665283203, "rewards/rejected": 0.0, "step": 1490 }, { "epoch": 8.329608938547485, "grad_norm": 0.9798544392097811, "learning_rate": 5.51415468352149e-07, "logits/chosen": -3.5855624675750732, "logits/rejected": -3.446227788925171, "logps/chosen": -4.108002185821533, "logps/rejected": -44.90166091918945, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 2.6292355060577393, "rewards/margins": 2.6292355060577393, "rewards/rejected": 0.0, "step": 1491 }, { "epoch": 8.335195530726256, "grad_norm": 2.018163556164869, "learning_rate": 5.508091170260173e-07, "logits/chosen": -3.568167209625244, "logits/rejected": -3.5685997009277344, "logps/chosen": -0.9147201776504517, "logps/rejected": -71.11444091796875, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 2.080580234527588, "rewards/margins": 2.080580234527588, "rewards/rejected": 0.0, "step": 1492 }, { "epoch": 8.340782122905027, "grad_norm": 1.0815963292564568, "learning_rate": 5.502026901884892e-07, "logits/chosen": -3.4329142570495605, "logits/rejected": -3.4202609062194824, "logps/chosen": -2.4329872131347656, "logps/rejected": -53.15180206298828, "loss": 0.0895, "rewards/accuracies": 1.0, "rewards/chosen": 2.744094133377075, "rewards/margins": 2.744094133377075, "rewards/rejected": 0.0, "step": 1493 }, { "epoch": 8.346368715083798, "grad_norm": 0.9285472556353264, "learning_rate": 5.495961887408228e-07, "logits/chosen": -3.2632803916931152, "logits/rejected": -3.1433935165405273, "logps/chosen": -5.980401515960693, "logps/rejected": -30.302764892578125, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 2.8719334602355957, "rewards/margins": 2.8719334602355957, "rewards/rejected": 0.0, "step": 1494 }, { "epoch": 8.35195530726257, "grad_norm": 0.9039569381629475, "learning_rate": 5.489896135843873e-07, "logits/chosen": -3.6132705211639404, "logits/rejected": -3.616997480392456, "logps/chosen": -1.6922589540481567, "logps/rejected": -57.058143615722656, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 2.035038948059082, "rewards/margins": 2.035038948059082, "rewards/rejected": 0.0, "step": 1495 }, { "epoch": 8.35754189944134, "grad_norm": 1.8677324812628862, "learning_rate": 5.483829656206614e-07, "logits/chosen": -3.5454249382019043, "logits/rejected": -3.6025378704071045, "logps/chosen": -26.181739807128906, "logps/rejected": -51.99799346923828, "loss": 0.1369, "rewards/accuracies": 1.0, "rewards/chosen": 3.705493211746216, "rewards/margins": 3.705493211746216, "rewards/rejected": 0.0, "step": 1496 }, { "epoch": 8.363128491620111, "grad_norm": 0.8850994610735713, "learning_rate": 5.477762457512322e-07, "logits/chosen": -3.386732816696167, "logits/rejected": -3.421370506286621, "logps/chosen": -15.596943855285645, "logps/rejected": -49.773773193359375, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 3.1644387245178223, "rewards/margins": 3.1644387245178223, "rewards/rejected": 0.0, "step": 1497 }, { "epoch": 8.368715083798882, "grad_norm": 1.8447022932238195, "learning_rate": 5.471694548777933e-07, "logits/chosen": -3.4856345653533936, "logits/rejected": -3.5015311241149902, "logps/chosen": -11.751615524291992, "logps/rejected": -30.04815101623535, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 3.336698293685913, "rewards/margins": 3.336698293685913, "rewards/rejected": 0.0, "step": 1498 }, { "epoch": 8.374301675977653, "grad_norm": 1.3753032193008334, "learning_rate": 5.46562593902144e-07, "logits/chosen": -3.624182939529419, "logits/rejected": -3.7773618698120117, "logps/chosen": -7.332164287567139, "logps/rejected": -43.15809631347656, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 3.1540799140930176, "rewards/margins": 3.1540799140930176, "rewards/rejected": 0.0, "step": 1499 }, { "epoch": 8.379888268156424, "grad_norm": 2.678452490793768, "learning_rate": 5.459556637261878e-07, "logits/chosen": -3.5382821559906006, "logits/rejected": -3.62601375579834, "logps/chosen": -0.8343517780303955, "logps/rejected": -106.4999771118164, "loss": 0.1039, "rewards/accuracies": 1.0, "rewards/chosen": 1.4513757228851318, "rewards/margins": 1.4513757228851318, "rewards/rejected": 0.0, "step": 1500 }, { "epoch": 8.379888268156424, "eval_logits/chosen": -3.3645050525665283, "eval_logits/rejected": -3.4921786785125732, "eval_logps/chosen": -26.272689819335938, "eval_logps/rejected": -48.18663024902344, "eval_loss": 0.7770476341247559, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.37973541021347046, "eval_rewards/margins": 0.37973541021347046, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7228, "eval_samples_per_second": 9.474, "eval_steps_per_second": 0.306, "step": 1500 }, { "epoch": 8.385474860335195, "grad_norm": 2.290756959944643, "learning_rate": 5.45348665251931e-07, "logits/chosen": -3.5833957195281982, "logits/rejected": -3.587348699569702, "logps/chosen": -18.493114471435547, "logps/rejected": -57.7999267578125, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.490065574645996, "rewards/margins": 3.490065574645996, "rewards/rejected": 0.0, "step": 1501 }, { "epoch": 8.391061452513966, "grad_norm": 1.3910650255217563, "learning_rate": 5.447415993814815e-07, "logits/chosen": -3.4692883491516113, "logits/rejected": -3.611524820327759, "logps/chosen": -9.678472518920898, "logps/rejected": -27.444807052612305, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 3.2595767974853516, "rewards/margins": 3.2595767974853516, "rewards/rejected": 0.0, "step": 1502 }, { "epoch": 8.396648044692737, "grad_norm": 1.6357281876397822, "learning_rate": 5.441344670170474e-07, "logits/chosen": -3.481549024581909, "logits/rejected": -3.4805691242218018, "logps/chosen": -14.577237129211426, "logps/rejected": -57.74549865722656, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": 3.264359474182129, "rewards/margins": 3.264359474182129, "rewards/rejected": 0.0, "step": 1503 }, { "epoch": 8.402234636871508, "grad_norm": 2.0084390245276076, "learning_rate": 5.435272690609352e-07, "logits/chosen": -2.948986768722534, "logits/rejected": -2.76161789894104, "logps/chosen": -15.853133201599121, "logps/rejected": -30.536693572998047, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": 3.1021533012390137, "rewards/margins": 3.1021533012390137, "rewards/rejected": 0.0, "step": 1504 }, { "epoch": 8.40782122905028, "grad_norm": 2.0121402411498, "learning_rate": 5.429200064155495e-07, "logits/chosen": -2.9251699447631836, "logits/rejected": -2.935551643371582, "logps/chosen": -6.076290130615234, "logps/rejected": -37.09535217285156, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 2.805009603500366, "rewards/margins": 2.805009603500366, "rewards/rejected": 0.0, "step": 1505 }, { "epoch": 8.41340782122905, "grad_norm": 3.389742466031773, "learning_rate": 5.423126799833906e-07, "logits/chosen": -3.3590941429138184, "logits/rejected": -3.596940040588379, "logps/chosen": -2.6399240493774414, "logps/rejected": -69.94694519042969, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 2.5809128284454346, "rewards/margins": 2.5809128284454346, "rewards/rejected": 0.0, "step": 1506 }, { "epoch": 8.418994413407821, "grad_norm": 1.358979664747821, "learning_rate": 5.41705290667054e-07, "logits/chosen": -3.5871829986572266, "logits/rejected": -3.5777151584625244, "logps/chosen": -12.903154373168945, "logps/rejected": -46.774505615234375, "loss": 0.1079, "rewards/accuracies": 1.0, "rewards/chosen": 3.6193244457244873, "rewards/margins": 3.6193244457244873, "rewards/rejected": 0.0, "step": 1507 }, { "epoch": 8.424581005586592, "grad_norm": 5.539810397595974, "learning_rate": 5.410978393692278e-07, "logits/chosen": -3.370941400527954, "logits/rejected": -3.4300525188446045, "logps/chosen": -5.34040641784668, "logps/rejected": -72.93252563476562, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 2.609839916229248, "rewards/margins": 2.609839916229248, "rewards/rejected": 0.0, "step": 1508 }, { "epoch": 8.430167597765363, "grad_norm": 1.8268379553037781, "learning_rate": 5.404903269926933e-07, "logits/chosen": -3.537126064300537, "logits/rejected": -3.7760980129241943, "logps/chosen": -8.276466369628906, "logps/rejected": -29.803316116333008, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 3.295217514038086, "rewards/margins": 3.295217514038086, "rewards/rejected": 0.0, "step": 1509 }, { "epoch": 8.435754189944134, "grad_norm": 1.6483309748748936, "learning_rate": 5.398827544403221e-07, "logits/chosen": -3.3367722034454346, "logits/rejected": -3.454108476638794, "logps/chosen": -2.6563539505004883, "logps/rejected": -61.56664276123047, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 2.2975759506225586, "rewards/margins": 2.2975759506225586, "rewards/rejected": 0.0, "step": 1510 }, { "epoch": 8.441340782122905, "grad_norm": 1.2754593474707578, "learning_rate": 5.392751226150748e-07, "logits/chosen": -3.487358570098877, "logits/rejected": -3.4356279373168945, "logps/chosen": -1.3859034776687622, "logps/rejected": -25.730405807495117, "loss": 0.1079, "rewards/accuracies": 1.0, "rewards/chosen": 1.415024995803833, "rewards/margins": 1.415024995803833, "rewards/rejected": 0.0, "step": 1511 }, { "epoch": 8.446927374301676, "grad_norm": 1.9779814605988295, "learning_rate": 5.386674324200009e-07, "logits/chosen": -3.5959315299987793, "logits/rejected": -3.629880428314209, "logps/chosen": -1.9470138549804688, "logps/rejected": -47.28021240234375, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.8410720825195312, "rewards/margins": 2.8410720825195312, "rewards/rejected": 0.0, "step": 1512 }, { "epoch": 8.452513966480447, "grad_norm": 1.0735096680239324, "learning_rate": 5.380596847582362e-07, "logits/chosen": -3.4882895946502686, "logits/rejected": -3.619725227355957, "logps/chosen": -2.756035089492798, "logps/rejected": -37.28264236450195, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 2.608781337738037, "rewards/margins": 2.608781337738037, "rewards/rejected": 0.0, "step": 1513 }, { "epoch": 8.458100558659218, "grad_norm": 3.0054957128503537, "learning_rate": 5.374518805330019e-07, "logits/chosen": -3.7351033687591553, "logits/rejected": -3.6582224369049072, "logps/chosen": -1.2357591390609741, "logps/rejected": -46.32286071777344, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": 2.195836067199707, "rewards/margins": 2.195836067199707, "rewards/rejected": 0.0, "step": 1514 }, { "epoch": 8.46368715083799, "grad_norm": 1.702956122371466, "learning_rate": 5.368440206476032e-07, "logits/chosen": -3.5749435424804688, "logits/rejected": -3.3852646350860596, "logps/chosen": -2.3170430660247803, "logps/rejected": -30.341320037841797, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 2.3695919513702393, "rewards/margins": 2.3695919513702393, "rewards/rejected": 0.0, "step": 1515 }, { "epoch": 8.46927374301676, "grad_norm": 3.202145899584805, "learning_rate": 5.362361060054284e-07, "logits/chosen": -3.424440622329712, "logits/rejected": -3.652538299560547, "logps/chosen": -1.2449885606765747, "logps/rejected": -80.08869934082031, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 2.3632490634918213, "rewards/margins": 2.3632490634918213, "rewards/rejected": 0.0, "step": 1516 }, { "epoch": 8.474860335195531, "grad_norm": 1.1199726304529054, "learning_rate": 5.356281375099467e-07, "logits/chosen": -3.2655513286590576, "logits/rejected": -3.421264171600342, "logps/chosen": -6.837052345275879, "logps/rejected": -21.229103088378906, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 2.299281120300293, "rewards/margins": 2.299281120300293, "rewards/rejected": 0.0, "step": 1517 }, { "epoch": 8.480446927374302, "grad_norm": 2.959212075028282, "learning_rate": 5.350201160647077e-07, "logits/chosen": -3.680194854736328, "logits/rejected": -3.5862388610839844, "logps/chosen": -6.810611724853516, "logps/rejected": -26.74586296081543, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 2.6470818519592285, "rewards/margins": 2.6470818519592285, "rewards/rejected": 0.0, "step": 1518 }, { "epoch": 8.486033519553073, "grad_norm": 3.517136317026258, "learning_rate": 5.344120425733394e-07, "logits/chosen": -3.3773891925811768, "logits/rejected": -3.3142926692962646, "logps/chosen": -1.5903003215789795, "logps/rejected": -86.06102752685547, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 2.4226956367492676, "rewards/margins": 2.4226956367492676, "rewards/rejected": 0.0, "step": 1519 }, { "epoch": 8.491620111731844, "grad_norm": 1.0342319624873688, "learning_rate": 5.338039179395474e-07, "logits/chosen": -3.496845245361328, "logits/rejected": -3.676945447921753, "logps/chosen": -0.4623601734638214, "logps/rejected": -49.321067810058594, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 1.5963795185089111, "rewards/margins": 1.5963795185089111, "rewards/rejected": 0.0, "step": 1520 }, { "epoch": 8.491620111731844, "eval_logits/chosen": -3.3499298095703125, "eval_logits/rejected": -3.480783462524414, "eval_logps/chosen": -26.07053565979004, "eval_logps/rejected": -47.708984375, "eval_loss": 0.7492290139198303, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.39995068311691284, "eval_rewards/margins": 0.39995068311691284, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7066, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 1520 }, { "epoch": 8.497206703910614, "grad_norm": 1.1080581426802347, "learning_rate": 5.331957430671131e-07, "logits/chosen": -3.567438840866089, "logits/rejected": -3.6613609790802, "logps/chosen": -5.783607482910156, "logps/rejected": -44.21714782714844, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 2.9833672046661377, "rewards/margins": 2.9833672046661377, "rewards/rejected": 0.0, "step": 1521 }, { "epoch": 8.502793296089386, "grad_norm": 0.7701425543102947, "learning_rate": 5.325875188598928e-07, "logits/chosen": -3.6467268466949463, "logits/rejected": -3.6299521923065186, "logps/chosen": -2.856154441833496, "logps/rejected": -51.156734466552734, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 2.678068161010742, "rewards/margins": 2.678068161010742, "rewards/rejected": 0.0, "step": 1522 }, { "epoch": 8.508379888268156, "grad_norm": 0.8195655538118419, "learning_rate": 5.319792462218162e-07, "logits/chosen": -3.4476943016052246, "logits/rejected": -3.5049538612365723, "logps/chosen": -7.8609185218811035, "logps/rejected": -33.09964370727539, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 2.5700316429138184, "rewards/margins": 2.5700316429138184, "rewards/rejected": 0.0, "step": 1523 }, { "epoch": 8.513966480446927, "grad_norm": 3.8949954334163324, "learning_rate": 5.313709260568842e-07, "logits/chosen": -3.5582966804504395, "logits/rejected": -3.485438346862793, "logps/chosen": -10.91259479522705, "logps/rejected": -30.21276092529297, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 3.235612392425537, "rewards/margins": 3.235612392425537, "rewards/rejected": 0.0, "step": 1524 }, { "epoch": 8.519553072625698, "grad_norm": 1.1650742674810737, "learning_rate": 5.307625592691694e-07, "logits/chosen": -3.4802331924438477, "logits/rejected": -3.549264669418335, "logps/chosen": -4.471827507019043, "logps/rejected": -28.547901153564453, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 2.785182237625122, "rewards/margins": 2.785182237625122, "rewards/rejected": 0.0, "step": 1525 }, { "epoch": 8.525139664804469, "grad_norm": 2.9184506790702223, "learning_rate": 5.30154146762813e-07, "logits/chosen": -3.5332322120666504, "logits/rejected": -3.2711181640625, "logps/chosen": -6.034396648406982, "logps/rejected": -29.913333892822266, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 3.1569344997406006, "rewards/margins": 3.1569344997406006, "rewards/rejected": 0.0, "step": 1526 }, { "epoch": 8.53072625698324, "grad_norm": 2.0859174781379504, "learning_rate": 5.295456894420243e-07, "logits/chosen": -3.5591535568237305, "logits/rejected": -3.5449156761169434, "logps/chosen": -4.0158162117004395, "logps/rejected": -25.790225982666016, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 2.8487677574157715, "rewards/margins": 2.8487677574157715, "rewards/rejected": 0.0, "step": 1527 }, { "epoch": 8.53631284916201, "grad_norm": 2.575040652943191, "learning_rate": 5.289371882110794e-07, "logits/chosen": -3.381300687789917, "logits/rejected": -3.3595049381256104, "logps/chosen": -0.8191465139389038, "logps/rejected": -40.855125427246094, "loss": 0.129, "rewards/accuracies": 1.0, "rewards/chosen": 1.7980883121490479, "rewards/margins": 1.7980883121490479, "rewards/rejected": 0.0, "step": 1528 }, { "epoch": 8.541899441340782, "grad_norm": 2.0603094432918603, "learning_rate": 5.283286439743194e-07, "logits/chosen": -3.4769701957702637, "logits/rejected": -3.696000337600708, "logps/chosen": -2.3960256576538086, "logps/rejected": -79.72267150878906, "loss": 0.1039, "rewards/accuracies": 1.0, "rewards/chosen": 2.579245090484619, "rewards/margins": 2.579245090484619, "rewards/rejected": 0.0, "step": 1529 }, { "epoch": 8.547486033519553, "grad_norm": 3.296843118035138, "learning_rate": 5.277200576361492e-07, "logits/chosen": -3.4191207885742188, "logits/rejected": -3.3528926372528076, "logps/chosen": -3.467456579208374, "logps/rejected": -77.3072280883789, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 2.1921169757843018, "rewards/margins": 2.1921169757843018, "rewards/rejected": 0.0, "step": 1530 }, { "epoch": 8.553072625698324, "grad_norm": 4.3778375320671366, "learning_rate": 5.271114301010368e-07, "logits/chosen": -3.7481722831726074, "logits/rejected": -3.469683885574341, "logps/chosen": -4.541937828063965, "logps/rejected": -50.02873992919922, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 2.622313976287842, "rewards/margins": 2.622313976287842, "rewards/rejected": 0.0, "step": 1531 }, { "epoch": 8.558659217877095, "grad_norm": 1.442013610427837, "learning_rate": 5.265027622735109e-07, "logits/chosen": -3.201296806335449, "logits/rejected": -3.4194226264953613, "logps/chosen": -0.5189584493637085, "logps/rejected": -107.17063903808594, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 1.6658756732940674, "rewards/margins": 1.6658756732940674, "rewards/rejected": 0.0, "step": 1532 }, { "epoch": 8.564245810055866, "grad_norm": 8.037115998414105, "learning_rate": 5.258940550581605e-07, "logits/chosen": -3.4911601543426514, "logits/rejected": -3.5738885402679443, "logps/chosen": -5.360485076904297, "logps/rejected": -48.8051643371582, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 2.3148505687713623, "rewards/margins": 2.3148505687713623, "rewards/rejected": 0.0, "step": 1533 }, { "epoch": 8.569832402234637, "grad_norm": 0.8811270446557269, "learning_rate": 5.252853093596327e-07, "logits/chosen": -3.525219678878784, "logits/rejected": -3.527994155883789, "logps/chosen": -29.17947006225586, "logps/rejected": -39.194740295410156, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 3.913750648498535, "rewards/margins": 3.913750648498535, "rewards/rejected": 0.0, "step": 1534 }, { "epoch": 8.575418994413408, "grad_norm": 1.4367890402400878, "learning_rate": 5.246765260826321e-07, "logits/chosen": -3.659507989883423, "logits/rejected": -3.5021913051605225, "logps/chosen": -14.039175987243652, "logps/rejected": -39.691810607910156, "loss": 0.1033, "rewards/accuracies": 1.0, "rewards/chosen": 3.6174304485321045, "rewards/margins": 3.6174304485321045, "rewards/rejected": 0.0, "step": 1535 }, { "epoch": 8.581005586592179, "grad_norm": 1.6099255248292248, "learning_rate": 5.240677061319193e-07, "logits/chosen": -3.3976521492004395, "logits/rejected": -3.2862162590026855, "logps/chosen": -14.649343490600586, "logps/rejected": -43.704246520996094, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 2.6249542236328125, "rewards/margins": 2.6249542236328125, "rewards/rejected": 0.0, "step": 1536 }, { "epoch": 8.58659217877095, "grad_norm": 1.438840290175727, "learning_rate": 5.234588504123088e-07, "logits/chosen": -3.433427095413208, "logits/rejected": -3.3993639945983887, "logps/chosen": -1.6107079982757568, "logps/rejected": -46.80220031738281, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 2.3125686645507812, "rewards/margins": 2.3125686645507812, "rewards/rejected": 0.0, "step": 1537 }, { "epoch": 8.59217877094972, "grad_norm": 2.1215231779282946, "learning_rate": 5.228499598286689e-07, "logits/chosen": -3.644007921218872, "logits/rejected": -3.645432472229004, "logps/chosen": -3.7853474617004395, "logps/rejected": -36.43778610229492, "loss": 0.1082, "rewards/accuracies": 1.0, "rewards/chosen": 3.251805305480957, "rewards/margins": 3.251805305480957, "rewards/rejected": 0.0, "step": 1538 }, { "epoch": 8.597765363128492, "grad_norm": 3.0671268932467886, "learning_rate": 5.222410352859192e-07, "logits/chosen": -3.644315719604492, "logits/rejected": -3.7836644649505615, "logps/chosen": -5.186097145080566, "logps/rejected": -33.71754455566406, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 3.1996922492980957, "rewards/margins": 3.1996922492980957, "rewards/rejected": 0.0, "step": 1539 }, { "epoch": 8.603351955307263, "grad_norm": 0.8552394511306705, "learning_rate": 5.216320776890303e-07, "logits/chosen": -3.527188539505005, "logits/rejected": -3.445709705352783, "logps/chosen": -14.652050018310547, "logps/rejected": -43.6121826171875, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": 3.5236968994140625, "rewards/margins": 3.5236968994140625, "rewards/rejected": 0.0, "step": 1540 }, { "epoch": 8.603351955307263, "eval_logits/chosen": -3.355664014816284, "eval_logits/rejected": -3.4829509258270264, "eval_logps/chosen": -26.36115074157715, "eval_logps/rejected": -48.14537811279297, "eval_loss": 0.7530530095100403, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": 0.3708893656730652, "eval_rewards/margins": 0.3708893656730652, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6917, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 1540 }, { "epoch": 8.608938547486034, "grad_norm": 2.0342816055264823, "learning_rate": 5.210230879430213e-07, "logits/chosen": -3.7195377349853516, "logits/rejected": -3.627046585083008, "logps/chosen": -6.564638614654541, "logps/rejected": -45.38517761230469, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 3.422337770462036, "rewards/margins": 3.422337770462036, "rewards/rejected": 0.0, "step": 1541 }, { "epoch": 8.614525139664805, "grad_norm": 2.3634223161676147, "learning_rate": 5.204140669529597e-07, "logits/chosen": -3.469510316848755, "logits/rejected": -3.533547878265381, "logps/chosen": -1.5332813262939453, "logps/rejected": -40.70933151245117, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 2.298218011856079, "rewards/margins": 2.298218011856079, "rewards/rejected": 0.0, "step": 1542 }, { "epoch": 8.620111731843576, "grad_norm": 4.001016468988307, "learning_rate": 5.198050156239592e-07, "logits/chosen": -3.583134651184082, "logits/rejected": -3.668936014175415, "logps/chosen": -2.190661668777466, "logps/rejected": -43.959571838378906, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 1.7127172946929932, "rewards/margins": 1.7127172946929932, "rewards/rejected": 0.0, "step": 1543 }, { "epoch": 8.625698324022347, "grad_norm": 3.2715619364999884, "learning_rate": 5.191959348611782e-07, "logits/chosen": -3.462294816970825, "logits/rejected": -3.3381593227386475, "logps/chosen": -7.955991268157959, "logps/rejected": -46.20689392089844, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 3.412771701812744, "rewards/margins": 3.412771701812744, "rewards/rejected": 0.0, "step": 1544 }, { "epoch": 8.631284916201118, "grad_norm": 1.2037941282863056, "learning_rate": 5.185868255698195e-07, "logits/chosen": -3.3791279792785645, "logits/rejected": -3.088306188583374, "logps/chosen": -9.527669906616211, "logps/rejected": -71.26609802246094, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": 2.5406413078308105, "rewards/margins": 2.5406413078308105, "rewards/rejected": 0.0, "step": 1545 }, { "epoch": 8.636871508379889, "grad_norm": 1.313322479436088, "learning_rate": 5.179776886551279e-07, "logits/chosen": -3.0730252265930176, "logits/rejected": -3.005383253097534, "logps/chosen": -6.606561660766602, "logps/rejected": -43.8348388671875, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 3.1686437129974365, "rewards/margins": 3.1686437129974365, "rewards/rejected": 0.0, "step": 1546 }, { "epoch": 8.64245810055866, "grad_norm": 2.0170485666166607, "learning_rate": 5.173685250223893e-07, "logits/chosen": -3.499685764312744, "logits/rejected": -3.5609824657440186, "logps/chosen": -8.598729133605957, "logps/rejected": -36.26488494873047, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 3.3983078002929688, "rewards/margins": 3.3983078002929688, "rewards/rejected": 0.0, "step": 1547 }, { "epoch": 8.64804469273743, "grad_norm": 0.9900053587305689, "learning_rate": 5.167593355769293e-07, "logits/chosen": -3.0416650772094727, "logits/rejected": -2.926699638366699, "logps/chosen": -21.06253433227539, "logps/rejected": -38.30773162841797, "loss": 0.1033, "rewards/accuracies": 1.0, "rewards/chosen": 3.208059787750244, "rewards/margins": 3.208059787750244, "rewards/rejected": 0.0, "step": 1548 }, { "epoch": 8.653631284916202, "grad_norm": 1.560846482349804, "learning_rate": 5.161501212241119e-07, "logits/chosen": -3.668612241744995, "logits/rejected": -3.6418755054473877, "logps/chosen": -0.6443605422973633, "logps/rejected": -52.83890151977539, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 1.734978437423706, "rewards/margins": 1.734978437423706, "rewards/rejected": 0.0, "step": 1549 }, { "epoch": 8.659217877094973, "grad_norm": 2.687339764265376, "learning_rate": 5.155408828693382e-07, "logits/chosen": -3.4790260791778564, "logits/rejected": -3.5892174243927, "logps/chosen": -6.555553913116455, "logps/rejected": -25.03845977783203, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.771357536315918, "rewards/margins": 2.771357536315918, "rewards/rejected": 0.0, "step": 1550 }, { "epoch": 8.664804469273744, "grad_norm": 2.065569564093539, "learning_rate": 5.14931621418045e-07, "logits/chosen": -3.499326229095459, "logits/rejected": -3.357387065887451, "logps/chosen": -5.375399589538574, "logps/rejected": -26.244762420654297, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.560816764831543, "rewards/margins": 2.560816764831543, "rewards/rejected": 0.0, "step": 1551 }, { "epoch": 8.670391061452515, "grad_norm": 1.4612654511791863, "learning_rate": 5.143223377757031e-07, "logits/chosen": -3.5222458839416504, "logits/rejected": -3.454744338989258, "logps/chosen": -5.21843147277832, "logps/rejected": -53.71820068359375, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 2.6841795444488525, "rewards/margins": 2.6841795444488525, "rewards/rejected": 0.0, "step": 1552 }, { "epoch": 8.675977653631286, "grad_norm": 2.888368778684556, "learning_rate": 5.137130328478166e-07, "logits/chosen": -3.3584721088409424, "logits/rejected": -3.3054399490356445, "logps/chosen": -16.143930435180664, "logps/rejected": -52.69926452636719, "loss": 0.1346, "rewards/accuracies": 1.0, "rewards/chosen": 3.1023361682891846, "rewards/margins": 3.1023361682891846, "rewards/rejected": 0.0, "step": 1553 }, { "epoch": 8.681564245810057, "grad_norm": 3.815984947336513, "learning_rate": 5.131037075399212e-07, "logits/chosen": -3.327307939529419, "logits/rejected": -3.222965717315674, "logps/chosen": -24.722620010375977, "logps/rejected": -67.58235168457031, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 2.7870500087738037, "rewards/margins": 2.7870500087738037, "rewards/rejected": 0.0, "step": 1554 }, { "epoch": 8.687150837988828, "grad_norm": 1.8643236715290348, "learning_rate": 5.124943627575826e-07, "logits/chosen": -3.320669412612915, "logits/rejected": -3.542205810546875, "logps/chosen": -14.131210327148438, "logps/rejected": -45.428321838378906, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 3.646136999130249, "rewards/margins": 3.646136999130249, "rewards/rejected": 0.0, "step": 1555 }, { "epoch": 8.692737430167599, "grad_norm": 1.9161077564537476, "learning_rate": 5.11884999406396e-07, "logits/chosen": -3.2250707149505615, "logits/rejected": -3.2313990592956543, "logps/chosen": -23.096805572509766, "logps/rejected": -60.486968994140625, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 3.9371800422668457, "rewards/margins": 3.9371800422668457, "rewards/rejected": 0.0, "step": 1556 }, { "epoch": 8.69832402234637, "grad_norm": 3.096848599623186, "learning_rate": 5.112756183919836e-07, "logits/chosen": -3.2024033069610596, "logits/rejected": -3.2210841178894043, "logps/chosen": -13.349126815795898, "logps/rejected": -54.73284912109375, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 2.460592746734619, "rewards/margins": 2.460592746734619, "rewards/rejected": 0.0, "step": 1557 }, { "epoch": 8.703910614525139, "grad_norm": 1.3174950052810543, "learning_rate": 5.106662206199942e-07, "logits/chosen": -3.3897392749786377, "logits/rejected": -3.579496145248413, "logps/chosen": -3.686384677886963, "logps/rejected": -26.034996032714844, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.913018226623535, "rewards/margins": 2.913018226623535, "rewards/rejected": 0.0, "step": 1558 }, { "epoch": 8.70949720670391, "grad_norm": 1.1337723992522022, "learning_rate": 5.100568069961014e-07, "logits/chosen": -3.577808141708374, "logits/rejected": -3.5318191051483154, "logps/chosen": -2.9288718700408936, "logps/rejected": -63.374305725097656, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.5752320289611816, "rewards/margins": 2.5752320289611816, "rewards/rejected": 0.0, "step": 1559 }, { "epoch": 8.71508379888268, "grad_norm": 3.3617005770425044, "learning_rate": 5.094473784260023e-07, "logits/chosen": -3.412342071533203, "logits/rejected": -3.5374176502227783, "logps/chosen": -6.275794506072998, "logps/rejected": -37.99629211425781, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 2.3646249771118164, "rewards/margins": 2.3646249771118164, "rewards/rejected": 0.0, "step": 1560 }, { "epoch": 8.71508379888268, "eval_logits/chosen": -3.343669891357422, "eval_logits/rejected": -3.472111463546753, "eval_logps/chosen": -26.061229705810547, "eval_logps/rejected": -47.825870513916016, "eval_loss": 0.7493130564689636, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.4008810520172119, "eval_rewards/margins": 0.4008810520172119, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7379, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 1560 }, { "epoch": 8.720670391061452, "grad_norm": 1.1258929545927845, "learning_rate": 5.088379358154165e-07, "logits/chosen": -3.7648444175720215, "logits/rejected": -3.540130376815796, "logps/chosen": -3.2312402725219727, "logps/rejected": -41.85802459716797, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.6411352157592773, "rewards/margins": 2.6411352157592773, "rewards/rejected": 0.0, "step": 1561 }, { "epoch": 8.726256983240223, "grad_norm": 1.2156419190124832, "learning_rate": 5.082284800700841e-07, "logits/chosen": -3.464627981185913, "logits/rejected": -3.509732961654663, "logps/chosen": -3.6714587211608887, "logps/rejected": -36.64527130126953, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 2.553143262863159, "rewards/margins": 2.553143262863159, "rewards/rejected": 0.0, "step": 1562 }, { "epoch": 8.731843575418994, "grad_norm": 1.3044713481705497, "learning_rate": 5.076190120957649e-07, "logits/chosen": -3.347734212875366, "logits/rejected": -3.411790370941162, "logps/chosen": -3.0454139709472656, "logps/rejected": -41.397701263427734, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 2.37506103515625, "rewards/margins": 2.37506103515625, "rewards/rejected": 0.0, "step": 1563 }, { "epoch": 8.737430167597765, "grad_norm": 1.3422598830715409, "learning_rate": 5.070095327982368e-07, "logits/chosen": -3.61736798286438, "logits/rejected": -3.5823631286621094, "logps/chosen": -15.124504089355469, "logps/rejected": -33.697410583496094, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 3.5412120819091797, "rewards/margins": 3.5412120819091797, "rewards/rejected": 0.0, "step": 1564 }, { "epoch": 8.743016759776536, "grad_norm": 0.9727540910407199, "learning_rate": 5.064000430832947e-07, "logits/chosen": -3.644531011581421, "logits/rejected": -3.639143943786621, "logps/chosen": -8.086284637451172, "logps/rejected": -24.990882873535156, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 3.3636388778686523, "rewards/margins": 3.3636388778686523, "rewards/rejected": 0.0, "step": 1565 }, { "epoch": 8.748603351955307, "grad_norm": 1.7613299882922084, "learning_rate": 5.057905438567488e-07, "logits/chosen": -3.5720908641815186, "logits/rejected": -3.421658515930176, "logps/chosen": -4.2872819900512695, "logps/rejected": -46.355926513671875, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 2.071406364440918, "rewards/margins": 2.071406364440918, "rewards/rejected": 0.0, "step": 1566 }, { "epoch": 8.754189944134078, "grad_norm": 2.2865459606433034, "learning_rate": 5.051810360244234e-07, "logits/chosen": -3.486600875854492, "logits/rejected": -3.6979551315307617, "logps/chosen": -19.66204071044922, "logps/rejected": -56.792850494384766, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 3.337367534637451, "rewards/margins": 3.337367534637451, "rewards/rejected": 0.0, "step": 1567 }, { "epoch": 8.759776536312849, "grad_norm": 7.629526182105958, "learning_rate": 5.045715204921559e-07, "logits/chosen": -3.6171090602874756, "logits/rejected": -3.5304319858551025, "logps/chosen": -16.706317901611328, "logps/rejected": -51.3306999206543, "loss": 0.1213, "rewards/accuracies": 1.0, "rewards/chosen": 2.9988741874694824, "rewards/margins": 2.9988741874694824, "rewards/rejected": 0.0, "step": 1568 }, { "epoch": 8.76536312849162, "grad_norm": 0.8949060173920261, "learning_rate": 5.039619981657948e-07, "logits/chosen": -3.300743818283081, "logits/rejected": -3.1763076782226562, "logps/chosen": -18.87421989440918, "logps/rejected": -66.6238021850586, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 3.896085023880005, "rewards/margins": 3.896085023880005, "rewards/rejected": 0.0, "step": 1569 }, { "epoch": 8.77094972067039, "grad_norm": 1.015738087328273, "learning_rate": 5.033524699511986e-07, "logits/chosen": -3.715214729309082, "logits/rejected": -3.7003979682922363, "logps/chosen": -5.552937030792236, "logps/rejected": -28.38661766052246, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 2.587810754776001, "rewards/margins": 2.587810754776001, "rewards/rejected": 0.0, "step": 1570 }, { "epoch": 8.776536312849162, "grad_norm": 3.058420619499173, "learning_rate": 5.027429367542352e-07, "logits/chosen": -3.286238193511963, "logits/rejected": -3.4611129760742188, "logps/chosen": -6.222301483154297, "logps/rejected": -40.57895278930664, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 2.6173553466796875, "rewards/margins": 2.6173553466796875, "rewards/rejected": 0.0, "step": 1571 }, { "epoch": 8.782122905027933, "grad_norm": 2.266191007271309, "learning_rate": 5.021333994807793e-07, "logits/chosen": -3.624561309814453, "logits/rejected": -3.5173733234405518, "logps/chosen": -2.490200996398926, "logps/rejected": -42.061866760253906, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 2.5689961910247803, "rewards/margins": 2.5689961910247803, "rewards/rejected": 0.0, "step": 1572 }, { "epoch": 8.787709497206704, "grad_norm": 1.1402949409937073, "learning_rate": 5.015238590367117e-07, "logits/chosen": -3.3423678874969482, "logits/rejected": -3.575061798095703, "logps/chosen": -3.269780158996582, "logps/rejected": -32.531211853027344, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 3.163800001144409, "rewards/margins": 3.163800001144409, "rewards/rejected": 0.0, "step": 1573 }, { "epoch": 8.793296089385475, "grad_norm": 3.5322342832499434, "learning_rate": 5.009143163279183e-07, "logits/chosen": -3.370183229446411, "logits/rejected": -3.5936026573181152, "logps/chosen": -0.7177491784095764, "logps/rejected": -57.73527526855469, "loss": 0.1397, "rewards/accuracies": 1.0, "rewards/chosen": 1.7981486320495605, "rewards/margins": 1.7981486320495605, "rewards/rejected": 0.0, "step": 1574 }, { "epoch": 8.798882681564246, "grad_norm": 3.2384878827922097, "learning_rate": 5.003047722602881e-07, "logits/chosen": -3.5188417434692383, "logits/rejected": -3.4905717372894287, "logps/chosen": -7.418839454650879, "logps/rejected": -43.177005767822266, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 3.034696102142334, "rewards/margins": 3.034696102142334, "rewards/rejected": 0.0, "step": 1575 }, { "epoch": 8.804469273743017, "grad_norm": 1.3958213066627738, "learning_rate": 4.99695227739712e-07, "logits/chosen": -3.5734057426452637, "logits/rejected": -3.4808127880096436, "logps/chosen": -4.616374969482422, "logps/rejected": -71.2945556640625, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 2.652176856994629, "rewards/margins": 2.652176856994629, "rewards/rejected": 0.0, "step": 1576 }, { "epoch": 8.810055865921788, "grad_norm": 1.1838207134161463, "learning_rate": 4.990856836720816e-07, "logits/chosen": -3.5023655891418457, "logits/rejected": -3.396958112716675, "logps/chosen": -3.69647216796875, "logps/rejected": -28.05923080444336, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 2.5006635189056396, "rewards/margins": 2.5006635189056396, "rewards/rejected": 0.0, "step": 1577 }, { "epoch": 8.815642458100559, "grad_norm": 1.417720591473832, "learning_rate": 4.984761409632881e-07, "logits/chosen": -3.582775592803955, "logits/rejected": -3.546322822570801, "logps/chosen": -5.0747480392456055, "logps/rejected": -47.944156646728516, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 3.0520803928375244, "rewards/margins": 3.0520803928375244, "rewards/rejected": 0.0, "step": 1578 }, { "epoch": 8.82122905027933, "grad_norm": 2.2313394840329717, "learning_rate": 4.978666005192208e-07, "logits/chosen": -3.461977958679199, "logits/rejected": -3.450249671936035, "logps/chosen": -30.081253051757812, "logps/rejected": -53.075565338134766, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 3.263123035430908, "rewards/margins": 3.263123035430908, "rewards/rejected": 0.0, "step": 1579 }, { "epoch": 8.8268156424581, "grad_norm": 4.088177642631041, "learning_rate": 4.972570632457648e-07, "logits/chosen": -3.4786570072174072, "logits/rejected": -3.452631711959839, "logps/chosen": -15.857526779174805, "logps/rejected": -98.74070739746094, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 2.5767078399658203, "rewards/margins": 2.5767078399658203, "rewards/rejected": 0.0, "step": 1580 }, { "epoch": 8.8268156424581, "eval_logits/chosen": -3.3498008251190186, "eval_logits/rejected": -3.4777941703796387, "eval_logps/chosen": -26.655345916748047, "eval_logps/rejected": -48.50056838989258, "eval_loss": 0.7682304978370667, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.3414694666862488, "eval_rewards/margins": 0.3414694666862488, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7177, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 1580 }, { "epoch": 8.832402234636872, "grad_norm": 4.485409113305424, "learning_rate": 4.966475300488013e-07, "logits/chosen": -3.4706907272338867, "logits/rejected": -3.287583112716675, "logps/chosen": -5.174438953399658, "logps/rejected": -85.02562713623047, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 2.533134937286377, "rewards/margins": 2.533134937286377, "rewards/rejected": 0.0, "step": 1581 }, { "epoch": 8.837988826815643, "grad_norm": 2.2256516999035076, "learning_rate": 4.960380018342054e-07, "logits/chosen": -3.248904228210449, "logits/rejected": -3.4592931270599365, "logps/chosen": -4.772720813751221, "logps/rejected": -38.422508239746094, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 2.3661322593688965, "rewards/margins": 2.3661322593688965, "rewards/rejected": 0.0, "step": 1582 }, { "epoch": 8.843575418994414, "grad_norm": 1.5716208939373597, "learning_rate": 4.954284795078441e-07, "logits/chosen": -3.2725048065185547, "logits/rejected": -3.2471745014190674, "logps/chosen": -15.56840705871582, "logps/rejected": -39.41374969482422, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.6685779094696045, "rewards/margins": 3.6685779094696045, "rewards/rejected": 0.0, "step": 1583 }, { "epoch": 8.849162011173185, "grad_norm": 3.1934728620877713, "learning_rate": 4.948189639755766e-07, "logits/chosen": -3.3488597869873047, "logits/rejected": -3.3317267894744873, "logps/chosen": -37.42424011230469, "logps/rejected": -30.104129791259766, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.6193454265594482, "rewards/margins": 3.6193454265594482, "rewards/rejected": 0.0, "step": 1584 }, { "epoch": 8.854748603351956, "grad_norm": 1.8094745652787352, "learning_rate": 4.942094561432512e-07, "logits/chosen": -3.3481955528259277, "logits/rejected": -3.2755868434906006, "logps/chosen": -13.137596130371094, "logps/rejected": -29.392597198486328, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 3.5587501525878906, "rewards/margins": 3.5587501525878906, "rewards/rejected": 0.0, "step": 1585 }, { "epoch": 8.860335195530727, "grad_norm": 1.6502555052491021, "learning_rate": 4.935999569167054e-07, "logits/chosen": -3.6667110919952393, "logits/rejected": -3.4561636447906494, "logps/chosen": -1.662327766418457, "logps/rejected": -52.39588165283203, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 2.203054189682007, "rewards/margins": 2.203054189682007, "rewards/rejected": 0.0, "step": 1586 }, { "epoch": 8.865921787709498, "grad_norm": 1.357996872364283, "learning_rate": 4.929904672017632e-07, "logits/chosen": -3.220386028289795, "logits/rejected": -3.1334450244903564, "logps/chosen": -7.67182731628418, "logps/rejected": -44.112003326416016, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 3.803406000137329, "rewards/margins": 3.803406000137329, "rewards/rejected": 0.0, "step": 1587 }, { "epoch": 8.871508379888269, "grad_norm": 2.3309656231730322, "learning_rate": 4.923809879042352e-07, "logits/chosen": -3.4371016025543213, "logits/rejected": -3.160398483276367, "logps/chosen": -6.324312210083008, "logps/rejected": -31.49691390991211, "loss": 0.0875, "rewards/accuracies": 1.0, "rewards/chosen": 3.0658135414123535, "rewards/margins": 3.0658135414123535, "rewards/rejected": 0.0, "step": 1588 }, { "epoch": 8.87709497206704, "grad_norm": 1.2529366060778842, "learning_rate": 4.917715199299159e-07, "logits/chosen": -3.572672128677368, "logits/rejected": -3.508350372314453, "logps/chosen": -1.9186047315597534, "logps/rejected": -27.849653244018555, "loss": 0.1235, "rewards/accuracies": 1.0, "rewards/chosen": 1.938552975654602, "rewards/margins": 1.938552975654602, "rewards/rejected": 0.0, "step": 1589 }, { "epoch": 8.88268156424581, "grad_norm": 1.2374703915768543, "learning_rate": 4.911620641845836e-07, "logits/chosen": -3.663745403289795, "logits/rejected": -3.6889708042144775, "logps/chosen": -0.6582353115081787, "logps/rejected": -80.0638427734375, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": 1.2671637535095215, "rewards/margins": 1.2671637535095215, "rewards/rejected": 0.0, "step": 1590 }, { "epoch": 8.888268156424582, "grad_norm": 2.035017885923048, "learning_rate": 4.905526215739977e-07, "logits/chosen": -3.5884945392608643, "logits/rejected": -3.4904232025146484, "logps/chosen": -18.197912216186523, "logps/rejected": -37.696136474609375, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 3.4336118698120117, "rewards/margins": 3.4336118698120117, "rewards/rejected": 0.0, "step": 1591 }, { "epoch": 8.893854748603353, "grad_norm": 4.704868496625444, "learning_rate": 4.899431930038986e-07, "logits/chosen": -3.2843191623687744, "logits/rejected": -3.3270316123962402, "logps/chosen": -17.2952823638916, "logps/rejected": -68.72885131835938, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.5416369438171387, "rewards/margins": 2.5416369438171387, "rewards/rejected": 0.0, "step": 1592 }, { "epoch": 8.899441340782122, "grad_norm": 3.9141381699334965, "learning_rate": 4.893337793800059e-07, "logits/chosen": -3.5973403453826904, "logits/rejected": -3.699655532836914, "logps/chosen": -7.495997428894043, "logps/rejected": -61.90679168701172, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 3.1217434406280518, "rewards/margins": 3.1217434406280518, "rewards/rejected": 0.0, "step": 1593 }, { "epoch": 8.905027932960895, "grad_norm": 4.424650691551774, "learning_rate": 4.887243816080165e-07, "logits/chosen": -3.300236225128174, "logits/rejected": -3.3679394721984863, "logps/chosen": -8.020062446594238, "logps/rejected": -63.595943450927734, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 2.423440456390381, "rewards/margins": 2.423440456390381, "rewards/rejected": 0.0, "step": 1594 }, { "epoch": 8.910614525139664, "grad_norm": 1.0632118047821677, "learning_rate": 4.881150005936041e-07, "logits/chosen": -2.886989116668701, "logits/rejected": -2.9253146648406982, "logps/chosen": -8.046568870544434, "logps/rejected": -35.37300109863281, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.350053310394287, "rewards/margins": 2.350053310394287, "rewards/rejected": 0.0, "step": 1595 }, { "epoch": 8.916201117318435, "grad_norm": 3.7207138444524617, "learning_rate": 4.875056372424173e-07, "logits/chosen": -3.399993896484375, "logits/rejected": -3.3559515476226807, "logps/chosen": -1.6196277141571045, "logps/rejected": -125.1887435913086, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": 2.29793643951416, "rewards/margins": 2.29793643951416, "rewards/rejected": 0.0, "step": 1596 }, { "epoch": 8.921787709497206, "grad_norm": 2.668075368170375, "learning_rate": 4.868962924600789e-07, "logits/chosen": -3.4909353256225586, "logits/rejected": -3.4668478965759277, "logps/chosen": -9.527432441711426, "logps/rejected": -28.397212982177734, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 3.1990573406219482, "rewards/margins": 3.1990573406219482, "rewards/rejected": 0.0, "step": 1597 }, { "epoch": 8.927374301675977, "grad_norm": 2.0253597516958934, "learning_rate": 4.862869671521834e-07, "logits/chosen": -3.2897651195526123, "logits/rejected": -3.4349746704101562, "logps/chosen": -3.4705586433410645, "logps/rejected": -33.25648498535156, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.474574089050293, "rewards/margins": 2.474574089050293, "rewards/rejected": 0.0, "step": 1598 }, { "epoch": 8.932960893854748, "grad_norm": 2.038151410844929, "learning_rate": 4.85677662224297e-07, "logits/chosen": -3.5211784839630127, "logits/rejected": -3.6392548084259033, "logps/chosen": -5.188231468200684, "logps/rejected": -89.66943359375, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 2.9291810989379883, "rewards/margins": 2.9291810989379883, "rewards/rejected": 0.0, "step": 1599 }, { "epoch": 8.938547486033519, "grad_norm": 0.9683098958025135, "learning_rate": 4.85068378581955e-07, "logits/chosen": -3.423570394515991, "logits/rejected": -3.555612564086914, "logps/chosen": -15.116783142089844, "logps/rejected": -31.30979347229004, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 3.50461745262146, "rewards/margins": 3.50461745262146, "rewards/rejected": 0.0, "step": 1600 }, { "epoch": 8.938547486033519, "eval_logits/chosen": -3.344449996948242, "eval_logits/rejected": -3.470318555831909, "eval_logps/chosen": -27.05573081970215, "eval_logps/rejected": -48.810630798339844, "eval_loss": 0.7781283259391785, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.30143120884895325, "eval_rewards/margins": 0.30143120884895325, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7151, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 1600 }, { "epoch": 8.94413407821229, "grad_norm": 2.923952822851652, "learning_rate": 4.844591171306617e-07, "logits/chosen": -3.3482167720794678, "logits/rejected": -3.3730177879333496, "logps/chosen": -12.375945091247559, "logps/rejected": -41.884944915771484, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 3.0999507904052734, "rewards/margins": 3.0999507904052734, "rewards/rejected": 0.0, "step": 1601 }, { "epoch": 8.949720670391061, "grad_norm": 1.99215640247595, "learning_rate": 4.838498787758881e-07, "logits/chosen": -3.27443265914917, "logits/rejected": -3.4482321739196777, "logps/chosen": -1.7216635942459106, "logps/rejected": -42.25780487060547, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 1.9816725254058838, "rewards/margins": 1.9816725254058838, "rewards/rejected": 0.0, "step": 1602 }, { "epoch": 8.955307262569832, "grad_norm": 2.984793762440282, "learning_rate": 4.832406644230707e-07, "logits/chosen": -3.319697141647339, "logits/rejected": -3.5623459815979004, "logps/chosen": -4.52077579498291, "logps/rejected": -47.441158294677734, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 2.188465118408203, "rewards/margins": 2.188465118408203, "rewards/rejected": 0.0, "step": 1603 }, { "epoch": 8.960893854748603, "grad_norm": 1.7438154477280894, "learning_rate": 4.826314749776108e-07, "logits/chosen": -3.4082627296447754, "logits/rejected": -3.320340156555176, "logps/chosen": -13.580062866210938, "logps/rejected": -42.55960464477539, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 3.284619092941284, "rewards/margins": 3.284619092941284, "rewards/rejected": 0.0, "step": 1604 }, { "epoch": 8.966480446927374, "grad_norm": 1.8153576446453652, "learning_rate": 4.820223113448722e-07, "logits/chosen": -3.4017300605773926, "logits/rejected": -3.4077565670013428, "logps/chosen": -0.5121971368789673, "logps/rejected": -56.97151184082031, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 1.4818192720413208, "rewards/margins": 1.4818192720413208, "rewards/rejected": 0.0, "step": 1605 }, { "epoch": 8.972067039106145, "grad_norm": 3.761434969368255, "learning_rate": 4.814131744301805e-07, "logits/chosen": -3.4453651905059814, "logits/rejected": -3.4377174377441406, "logps/chosen": -3.7002198696136475, "logps/rejected": -69.67536926269531, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 2.8244028091430664, "rewards/margins": 2.8244028091430664, "rewards/rejected": 0.0, "step": 1606 }, { "epoch": 8.977653631284916, "grad_norm": 2.1432809831255684, "learning_rate": 4.808040651388218e-07, "logits/chosen": -3.3408074378967285, "logits/rejected": -3.4254205226898193, "logps/chosen": -1.5616624355316162, "logps/rejected": -40.005062103271484, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": 2.43265438079834, "rewards/margins": 2.43265438079834, "rewards/rejected": 0.0, "step": 1607 }, { "epoch": 8.983240223463687, "grad_norm": 2.3523641894817895, "learning_rate": 4.801949843760409e-07, "logits/chosen": -3.645590305328369, "logits/rejected": -3.5588388442993164, "logps/chosen": -6.469679355621338, "logps/rejected": -34.304443359375, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 2.7767696380615234, "rewards/margins": 2.7767696380615234, "rewards/rejected": 0.0, "step": 1608 }, { "epoch": 8.988826815642458, "grad_norm": 1.2175327141580934, "learning_rate": 4.795859330470402e-07, "logits/chosen": -3.355254888534546, "logits/rejected": -3.4555022716522217, "logps/chosen": -29.797555923461914, "logps/rejected": -46.1976318359375, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 3.2099294662475586, "rewards/margins": 3.2099294662475586, "rewards/rejected": 0.0, "step": 1609 }, { "epoch": 8.994413407821229, "grad_norm": 1.8427440846233556, "learning_rate": 4.789769120569785e-07, "logits/chosen": -3.239454746246338, "logits/rejected": -3.0870308876037598, "logps/chosen": -5.3824357986450195, "logps/rejected": -60.95530319213867, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 2.3480262756347656, "rewards/margins": 2.3480262756347656, "rewards/rejected": 0.0, "step": 1610 }, { "epoch": 9.0, "grad_norm": 3.696445974241549, "learning_rate": 4.783679223109699e-07, "logits/chosen": -3.4663498401641846, "logits/rejected": -3.6648917198181152, "logps/chosen": -2.7009425163269043, "logps/rejected": -34.224090576171875, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 1.5710911750793457, "rewards/margins": 1.5710911750793457, "rewards/rejected": 0.0, "step": 1611 }, { "epoch": 9.005586592178771, "grad_norm": 1.7836807376205073, "learning_rate": 4.777589647140808e-07, "logits/chosen": -3.4373486042022705, "logits/rejected": -3.411127805709839, "logps/chosen": -6.093512535095215, "logps/rejected": -47.09807586669922, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 3.146267890930176, "rewards/margins": 3.146267890930176, "rewards/rejected": 0.0, "step": 1612 }, { "epoch": 9.011173184357542, "grad_norm": 1.7596652181504093, "learning_rate": 4.77150040171331e-07, "logits/chosen": -3.6059412956237793, "logits/rejected": -3.623866319656372, "logps/chosen": -2.8918423652648926, "logps/rejected": -74.71697998046875, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 2.2367377281188965, "rewards/margins": 2.2367377281188965, "rewards/rejected": 0.0, "step": 1613 }, { "epoch": 9.016759776536313, "grad_norm": 3.097676263988404, "learning_rate": 4.765411495876912e-07, "logits/chosen": -3.3083105087280273, "logits/rejected": -3.602318048477173, "logps/chosen": -0.7240970134735107, "logps/rejected": -40.536834716796875, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 2.0945539474487305, "rewards/margins": 2.0945539474487305, "rewards/rejected": 0.0, "step": 1614 }, { "epoch": 9.022346368715084, "grad_norm": 1.4345374265099398, "learning_rate": 4.7593229386808077e-07, "logits/chosen": -3.74735426902771, "logits/rejected": -3.740783929824829, "logps/chosen": -2.399965763092041, "logps/rejected": -33.345176696777344, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.3207335472106934, "rewards/margins": 2.3207335472106934, "rewards/rejected": 0.0, "step": 1615 }, { "epoch": 9.027932960893855, "grad_norm": 2.0122321395956786, "learning_rate": 4.7532347391736775e-07, "logits/chosen": -3.1974687576293945, "logits/rejected": -3.291553020477295, "logps/chosen": -3.7264633178710938, "logps/rejected": -57.64251708984375, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3138155937194824, "rewards/margins": 2.3138155937194824, "rewards/rejected": 0.0, "step": 1616 }, { "epoch": 9.033519553072626, "grad_norm": 1.5822959394116287, "learning_rate": 4.7471469064036735e-07, "logits/chosen": -3.2524325847625732, "logits/rejected": -3.3430263996124268, "logps/chosen": -0.9469399452209473, "logps/rejected": -35.770469665527344, "loss": 0.1363, "rewards/accuracies": 1.0, "rewards/chosen": 2.1638243198394775, "rewards/margins": 2.1638243198394775, "rewards/rejected": 0.0, "step": 1617 }, { "epoch": 9.039106145251397, "grad_norm": 3.2426545116767067, "learning_rate": 4.7410594494183955e-07, "logits/chosen": -3.6374306678771973, "logits/rejected": -3.6596550941467285, "logps/chosen": -4.790374279022217, "logps/rejected": -36.828216552734375, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 2.5422682762145996, "rewards/margins": 2.5422682762145996, "rewards/rejected": 0.0, "step": 1618 }, { "epoch": 9.044692737430168, "grad_norm": 0.6697225002492078, "learning_rate": 4.734972377264892e-07, "logits/chosen": -3.4723641872406006, "logits/rejected": -3.276477336883545, "logps/chosen": -2.1712141036987305, "logps/rejected": -37.70220947265625, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 2.700287342071533, "rewards/margins": 2.700287342071533, "rewards/rejected": 0.0, "step": 1619 }, { "epoch": 9.050279329608939, "grad_norm": 0.6852575228381395, "learning_rate": 4.728885698989633e-07, "logits/chosen": -3.3522133827209473, "logits/rejected": -3.481334924697876, "logps/chosen": -1.090745449066162, "logps/rejected": -39.78439712524414, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.2486796379089355, "rewards/margins": 2.2486796379089355, "rewards/rejected": 0.0, "step": 1620 }, { "epoch": 9.050279329608939, "eval_logits/chosen": -3.3324215412139893, "eval_logits/rejected": -3.4577934741973877, "eval_logps/chosen": -26.813121795654297, "eval_logps/rejected": -48.46649932861328, "eval_loss": 0.7766733765602112, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 0.325691819190979, "eval_rewards/margins": 0.325691819190979, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7163, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 1620 }, { "epoch": 9.05586592178771, "grad_norm": 1.5025125740946996, "learning_rate": 4.7227994236385083e-07, "logits/chosen": -3.576542854309082, "logits/rejected": -3.614333391189575, "logps/chosen": -3.1437058448791504, "logps/rejected": -38.23810577392578, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 2.709089994430542, "rewards/margins": 2.709089994430542, "rewards/rejected": 0.0, "step": 1621 }, { "epoch": 9.061452513966481, "grad_norm": 0.8305849529527441, "learning_rate": 4.7167135602568086e-07, "logits/chosen": -3.600114583969116, "logits/rejected": -3.417306900024414, "logps/chosen": -0.8178118467330933, "logps/rejected": -37.192344665527344, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": 2.218064546585083, "rewards/margins": 2.218064546585083, "rewards/rejected": 0.0, "step": 1622 }, { "epoch": 9.067039106145252, "grad_norm": 1.104244303328039, "learning_rate": 4.710628117889207e-07, "logits/chosen": -3.3144118785858154, "logits/rejected": -3.446066379547119, "logps/chosen": -7.979313850402832, "logps/rejected": -66.49369812011719, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 3.54119610786438, "rewards/margins": 3.54119610786438, "rewards/rejected": 0.0, "step": 1623 }, { "epoch": 9.072625698324023, "grad_norm": 0.6169748980165974, "learning_rate": 4.704543105579757e-07, "logits/chosen": -3.5144424438476562, "logits/rejected": -3.677825927734375, "logps/chosen": -6.750329971313477, "logps/rejected": -47.67042922973633, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 3.5624661445617676, "rewards/margins": 3.5624661445617676, "rewards/rejected": 0.0, "step": 1624 }, { "epoch": 9.078212290502794, "grad_norm": 1.6088784590881093, "learning_rate": 4.6984585323718705e-07, "logits/chosen": -3.200291633605957, "logits/rejected": -3.248264789581299, "logps/chosen": -1.1295030117034912, "logps/rejected": -40.96461486816406, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 2.1124207973480225, "rewards/margins": 2.1124207973480225, "rewards/rejected": 0.0, "step": 1625 }, { "epoch": 9.083798882681565, "grad_norm": 1.4466208468231971, "learning_rate": 4.692374407308307e-07, "logits/chosen": -3.436687707901001, "logits/rejected": -3.4873499870300293, "logps/chosen": -6.9484543800354, "logps/rejected": -45.24601364135742, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 2.7573416233062744, "rewards/margins": 2.7573416233062744, "rewards/rejected": 0.0, "step": 1626 }, { "epoch": 9.089385474860336, "grad_norm": 1.031143712645248, "learning_rate": 4.686290739431157e-07, "logits/chosen": -3.438779354095459, "logits/rejected": -3.37857985496521, "logps/chosen": -11.256845474243164, "logps/rejected": -40.809532165527344, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 3.0104825496673584, "rewards/margins": 3.0104825496673584, "rewards/rejected": 0.0, "step": 1627 }, { "epoch": 9.094972067039107, "grad_norm": 1.5021763413826712, "learning_rate": 4.68020753778184e-07, "logits/chosen": -3.7217040061950684, "logits/rejected": -3.7122786045074463, "logps/chosen": -2.4026668071746826, "logps/rejected": -53.36454772949219, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 2.717082977294922, "rewards/margins": 2.717082977294922, "rewards/rejected": 0.0, "step": 1628 }, { "epoch": 9.100558659217878, "grad_norm": 0.6685998811669954, "learning_rate": 4.674124811401071e-07, "logits/chosen": -3.4131436347961426, "logits/rejected": -3.5219039916992188, "logps/chosen": -3.5351905822753906, "logps/rejected": -88.4783706665039, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 3.042217493057251, "rewards/margins": 3.042217493057251, "rewards/rejected": 0.0, "step": 1629 }, { "epoch": 9.106145251396647, "grad_norm": 1.0102987999404707, "learning_rate": 4.6680425693288683e-07, "logits/chosen": -3.4946987628936768, "logits/rejected": -3.326328992843628, "logps/chosen": -3.51169753074646, "logps/rejected": -70.41123962402344, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 2.480116844177246, "rewards/margins": 2.480116844177246, "rewards/rejected": 0.0, "step": 1630 }, { "epoch": 9.111731843575418, "grad_norm": 0.8886492547646674, "learning_rate": 4.6619608206045275e-07, "logits/chosen": -3.7399768829345703, "logits/rejected": -3.654049873352051, "logps/chosen": -10.962431907653809, "logps/rejected": -51.43917465209961, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": 3.465006113052368, "rewards/margins": 3.465006113052368, "rewards/rejected": 0.0, "step": 1631 }, { "epoch": 9.11731843575419, "grad_norm": 1.4639677826571171, "learning_rate": 4.655879574266607e-07, "logits/chosen": -3.641827344894409, "logits/rejected": -3.5533337593078613, "logps/chosen": -17.22907257080078, "logps/rejected": -40.89700698852539, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 3.298551321029663, "rewards/margins": 3.298551321029663, "rewards/rejected": 0.0, "step": 1632 }, { "epoch": 9.12290502793296, "grad_norm": 0.8847487547987789, "learning_rate": 4.649798839352923e-07, "logits/chosen": -3.5922327041625977, "logits/rejected": -3.620138645172119, "logps/chosen": -3.5823731422424316, "logps/rejected": -13.908525466918945, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 2.8184237480163574, "rewards/margins": 2.8184237480163574, "rewards/rejected": 0.0, "step": 1633 }, { "epoch": 9.128491620111731, "grad_norm": 0.983025461046181, "learning_rate": 4.643718624900534e-07, "logits/chosen": -3.3321423530578613, "logits/rejected": -3.5353293418884277, "logps/chosen": -5.316637992858887, "logps/rejected": -54.03192138671875, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 3.1418213844299316, "rewards/margins": 3.1418213844299316, "rewards/rejected": 0.0, "step": 1634 }, { "epoch": 9.134078212290502, "grad_norm": 1.0651140276522517, "learning_rate": 4.6376389399457175e-07, "logits/chosen": -3.2532856464385986, "logits/rejected": -3.3750452995300293, "logps/chosen": -27.882795333862305, "logps/rejected": -73.39027404785156, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 2.414515495300293, "rewards/margins": 2.414515495300293, "rewards/rejected": 0.0, "step": 1635 }, { "epoch": 9.139664804469273, "grad_norm": 1.0352375591863479, "learning_rate": 4.6315597935239673e-07, "logits/chosen": -3.720578670501709, "logits/rejected": -3.4379751682281494, "logps/chosen": -23.957807540893555, "logps/rejected": -45.56193923950195, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 4.235013961791992, "rewards/margins": 4.235013961791992, "rewards/rejected": 0.0, "step": 1636 }, { "epoch": 9.145251396648044, "grad_norm": 0.8748613766094245, "learning_rate": 4.6254811946699825e-07, "logits/chosen": -3.546139717102051, "logits/rejected": -3.5422675609588623, "logps/chosen": -4.041231632232666, "logps/rejected": -56.439640045166016, "loss": 0.0857, "rewards/accuracies": 1.0, "rewards/chosen": 3.1619534492492676, "rewards/margins": 3.1619534492492676, "rewards/rejected": 0.0, "step": 1637 }, { "epoch": 9.150837988826815, "grad_norm": 0.9517846054201481, "learning_rate": 4.619403152417638e-07, "logits/chosen": -3.819573163986206, "logits/rejected": -3.7253293991088867, "logps/chosen": -1.1498639583587646, "logps/rejected": -24.196958541870117, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 2.2579398155212402, "rewards/margins": 2.2579398155212402, "rewards/rejected": 0.0, "step": 1638 }, { "epoch": 9.156424581005586, "grad_norm": 1.0271047737056191, "learning_rate": 4.6133256757999894e-07, "logits/chosen": -3.3910739421844482, "logits/rejected": -3.5881876945495605, "logps/chosen": -4.272151470184326, "logps/rejected": -36.314308166503906, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 2.5699243545532227, "rewards/margins": 2.5699243545532227, "rewards/rejected": 0.0, "step": 1639 }, { "epoch": 9.162011173184357, "grad_norm": 0.7567277389197201, "learning_rate": 4.607248773849252e-07, "logits/chosen": -3.5559473037719727, "logits/rejected": -3.5395753383636475, "logps/chosen": -2.048529624938965, "logps/rejected": -62.623008728027344, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 2.2905678749084473, "rewards/margins": 2.2905678749084473, "rewards/rejected": 0.0, "step": 1640 }, { "epoch": 9.162011173184357, "eval_logits/chosen": -3.3215668201446533, "eval_logits/rejected": -3.450575590133667, "eval_logps/chosen": -27.634836196899414, "eval_logps/rejected": -50.139163970947266, "eval_loss": 0.8401439189910889, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 0.2435208559036255, "eval_rewards/margins": 0.2435208559036255, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6858, "eval_samples_per_second": 9.484, "eval_steps_per_second": 0.306, "step": 1640 }, { "epoch": 9.167597765363128, "grad_norm": 0.6652376165818646, "learning_rate": 4.60117245559678e-07, "logits/chosen": -3.4068853855133057, "logits/rejected": -3.4603068828582764, "logps/chosen": -3.901829957962036, "logps/rejected": -61.692291259765625, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 2.3450779914855957, "rewards/margins": 2.3450779914855957, "rewards/rejected": 0.0, "step": 1641 }, { "epoch": 9.1731843575419, "grad_norm": 2.0880314630346977, "learning_rate": 4.595096730073066e-07, "logits/chosen": -3.1529829502105713, "logits/rejected": -3.1645779609680176, "logps/chosen": -23.6221981048584, "logps/rejected": -31.06586265563965, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": 2.6671934127807617, "rewards/margins": 2.6671934127807617, "rewards/rejected": 0.0, "step": 1642 }, { "epoch": 9.17877094972067, "grad_norm": 2.6801130903246286, "learning_rate": 4.5890216063077227e-07, "logits/chosen": -3.447528123855591, "logits/rejected": -3.3506224155426025, "logps/chosen": -1.7273507118225098, "logps/rejected": -56.97646713256836, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 2.118861198425293, "rewards/margins": 2.118861198425293, "rewards/rejected": 0.0, "step": 1643 }, { "epoch": 9.184357541899441, "grad_norm": 0.6879957323627479, "learning_rate": 4.582947093329461e-07, "logits/chosen": -3.5328001976013184, "logits/rejected": -3.387763738632202, "logps/chosen": -7.785747528076172, "logps/rejected": -25.19640350341797, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 3.3997373580932617, "rewards/margins": 3.3997373580932617, "rewards/rejected": 0.0, "step": 1644 }, { "epoch": 9.189944134078212, "grad_norm": 0.6458114477243525, "learning_rate": 4.576873200166093e-07, "logits/chosen": -3.577589750289917, "logits/rejected": -3.549164295196533, "logps/chosen": -15.643972396850586, "logps/rejected": -36.84091567993164, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 3.667699098587036, "rewards/margins": 3.667699098587036, "rewards/rejected": 0.0, "step": 1645 }, { "epoch": 9.195530726256983, "grad_norm": 0.653742575908524, "learning_rate": 4.570799935844506e-07, "logits/chosen": -3.426997184753418, "logits/rejected": -3.263404369354248, "logps/chosen": -19.646692276000977, "logps/rejected": -34.58079147338867, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.6539430618286133, "rewards/margins": 3.6539430618286133, "rewards/rejected": 0.0, "step": 1646 }, { "epoch": 9.201117318435754, "grad_norm": 0.6338413469102935, "learning_rate": 4.564727309390648e-07, "logits/chosen": -3.516860246658325, "logits/rejected": -3.3906006813049316, "logps/chosen": -23.571407318115234, "logps/rejected": -71.54456329345703, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 4.067446708679199, "rewards/margins": 4.067446708679199, "rewards/rejected": 0.0, "step": 1647 }, { "epoch": 9.206703910614525, "grad_norm": 1.3750926237769714, "learning_rate": 4.5586553298295284e-07, "logits/chosen": -3.391157388687134, "logits/rejected": -3.4704341888427734, "logps/chosen": -0.6388237476348877, "logps/rejected": -45.884033203125, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 1.884291410446167, "rewards/margins": 1.884291410446167, "rewards/rejected": 0.0, "step": 1648 }, { "epoch": 9.212290502793296, "grad_norm": 2.2531424336147983, "learning_rate": 4.552584006185186e-07, "logits/chosen": -3.6195030212402344, "logits/rejected": -3.72220778465271, "logps/chosen": -1.3236113786697388, "logps/rejected": -73.94574737548828, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 2.1073408126831055, "rewards/margins": 2.1073408126831055, "rewards/rejected": 0.0, "step": 1649 }, { "epoch": 9.217877094972067, "grad_norm": 1.483944660137312, "learning_rate": 4.54651334748069e-07, "logits/chosen": -3.4388511180877686, "logits/rejected": -3.500291109085083, "logps/chosen": -1.9012879133224487, "logps/rejected": -39.856170654296875, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 2.5029850006103516, "rewards/margins": 2.5029850006103516, "rewards/rejected": 0.0, "step": 1650 }, { "epoch": 9.223463687150838, "grad_norm": 1.4906533168379197, "learning_rate": 4.540443362738123e-07, "logits/chosen": -3.5285568237304688, "logits/rejected": -3.536672353744507, "logps/chosen": -7.189120769500732, "logps/rejected": -36.30305480957031, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 3.1984524726867676, "rewards/margins": 3.1984524726867676, "rewards/rejected": 0.0, "step": 1651 }, { "epoch": 9.22905027932961, "grad_norm": 1.009822880276252, "learning_rate": 4.534374060978561e-07, "logits/chosen": -3.6347761154174805, "logits/rejected": -3.5859129428863525, "logps/chosen": -3.876652240753174, "logps/rejected": -57.586734771728516, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 2.904188632965088, "rewards/margins": 2.904188632965088, "rewards/rejected": 0.0, "step": 1652 }, { "epoch": 9.23463687150838, "grad_norm": 1.0512488552773978, "learning_rate": 4.5283054512220666e-07, "logits/chosen": -3.4411423206329346, "logits/rejected": -3.469578981399536, "logps/chosen": -0.7947466373443604, "logps/rejected": -97.12767028808594, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 1.72902250289917, "rewards/margins": 1.72902250289917, "rewards/rejected": 0.0, "step": 1653 }, { "epoch": 9.240223463687151, "grad_norm": 1.046013283113762, "learning_rate": 4.5222375424876783e-07, "logits/chosen": -3.492800712585449, "logits/rejected": -3.6472015380859375, "logps/chosen": -13.19853687286377, "logps/rejected": -42.79853820800781, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 3.3905725479125977, "rewards/margins": 3.3905725479125977, "rewards/rejected": 0.0, "step": 1654 }, { "epoch": 9.245810055865922, "grad_norm": 0.6472985246797178, "learning_rate": 4.516170343793386e-07, "logits/chosen": -3.4796149730682373, "logits/rejected": -3.7505271434783936, "logps/chosen": -3.8719398975372314, "logps/rejected": -54.36867141723633, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 3.4371376037597656, "rewards/margins": 3.4371376037597656, "rewards/rejected": 0.0, "step": 1655 }, { "epoch": 9.251396648044693, "grad_norm": 1.1471450171632256, "learning_rate": 4.510103864156127e-07, "logits/chosen": -3.6489460468292236, "logits/rejected": -3.6754863262176514, "logps/chosen": -4.7925190925598145, "logps/rejected": -50.89821243286133, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 2.939988851547241, "rewards/margins": 2.939988851547241, "rewards/rejected": 0.0, "step": 1656 }, { "epoch": 9.256983240223464, "grad_norm": 1.5275573364432178, "learning_rate": 4.5040381125917733e-07, "logits/chosen": -3.6592698097229004, "logits/rejected": -3.6023573875427246, "logps/chosen": -6.153960227966309, "logps/rejected": -22.277875900268555, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": 3.3746848106384277, "rewards/margins": 3.3746848106384277, "rewards/rejected": 0.0, "step": 1657 }, { "epoch": 9.262569832402235, "grad_norm": 3.659832937503148, "learning_rate": 4.497973098115108e-07, "logits/chosen": -3.221414804458618, "logits/rejected": -3.2071645259857178, "logps/chosen": -1.912853479385376, "logps/rejected": -43.9027099609375, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 2.023972511291504, "rewards/margins": 2.023972511291504, "rewards/rejected": 0.0, "step": 1658 }, { "epoch": 9.268156424581006, "grad_norm": 0.9419207384234801, "learning_rate": 4.491908829739826e-07, "logits/chosen": -3.550814628601074, "logits/rejected": -3.6960623264312744, "logps/chosen": -1.5808019638061523, "logps/rejected": -105.10787963867188, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 2.259671449661255, "rewards/margins": 2.259671449661255, "rewards/rejected": 0.0, "step": 1659 }, { "epoch": 9.273743016759777, "grad_norm": 0.9918028558746579, "learning_rate": 4.48584531647851e-07, "logits/chosen": -3.4608771800994873, "logits/rejected": -3.5884146690368652, "logps/chosen": -2.104559898376465, "logps/rejected": -50.12146759033203, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.90057373046875, "rewards/margins": 2.90057373046875, "rewards/rejected": 0.0, "step": 1660 }, { "epoch": 9.273743016759777, "eval_logits/chosen": -3.320514678955078, "eval_logits/rejected": -3.4494194984436035, "eval_logps/chosen": -27.093725204467773, "eval_logps/rejected": -50.0571403503418, "eval_loss": 0.822568416595459, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.2976315915584564, "eval_rewards/margins": 0.2976315915584564, "eval_rewards/rejected": 0.0, "eval_runtime": 32.728, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 1660 }, { "epoch": 9.279329608938548, "grad_norm": 0.7978783107009588, "learning_rate": 4.479782567342619e-07, "logits/chosen": -3.4343059062957764, "logits/rejected": -3.4542601108551025, "logps/chosen": -2.98053240776062, "logps/rejected": -68.02059936523438, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 2.34826397895813, "rewards/margins": 2.34826397895813, "rewards/rejected": 0.0, "step": 1661 }, { "epoch": 9.28491620111732, "grad_norm": 1.215827653811134, "learning_rate": 4.4737205913424775e-07, "logits/chosen": -3.477102041244507, "logits/rejected": -3.492522954940796, "logps/chosen": -1.10588538646698, "logps/rejected": -28.390304565429688, "loss": 0.1224, "rewards/accuracies": 1.0, "rewards/chosen": 2.6332898139953613, "rewards/margins": 2.6332898139953613, "rewards/rejected": 0.0, "step": 1662 }, { "epoch": 9.29050279329609, "grad_norm": 4.523770466035961, "learning_rate": 4.4676593974872647e-07, "logits/chosen": -3.2639989852905273, "logits/rejected": -3.5039234161376953, "logps/chosen": -1.6709858179092407, "logps/rejected": -32.406158447265625, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 2.325526237487793, "rewards/margins": 2.325526237487793, "rewards/rejected": 0.0, "step": 1663 }, { "epoch": 9.296089385474861, "grad_norm": 1.1902724242122853, "learning_rate": 4.46159899478499e-07, "logits/chosen": -3.3042995929718018, "logits/rejected": -3.3759191036224365, "logps/chosen": -1.20881187915802, "logps/rejected": -48.647247314453125, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 2.325157403945923, "rewards/margins": 2.325157403945923, "rewards/rejected": 0.0, "step": 1664 }, { "epoch": 9.30167597765363, "grad_norm": 3.9169471879173385, "learning_rate": 4.455539392242494e-07, "logits/chosen": -3.4400534629821777, "logits/rejected": -3.3671622276306152, "logps/chosen": -4.663588047027588, "logps/rejected": -29.19081687927246, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 3.0808463096618652, "rewards/margins": 3.0808463096618652, "rewards/rejected": 0.0, "step": 1665 }, { "epoch": 9.307262569832401, "grad_norm": 1.4183777468015464, "learning_rate": 4.4494805988654256e-07, "logits/chosen": -3.6820247173309326, "logits/rejected": -3.5284478664398193, "logps/chosen": -0.8771708011627197, "logps/rejected": -41.76000213623047, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 1.9157788753509521, "rewards/margins": 1.9157788753509521, "rewards/rejected": 0.0, "step": 1666 }, { "epoch": 9.312849162011172, "grad_norm": 0.6651252353283887, "learning_rate": 4.443422623658227e-07, "logits/chosen": -3.6375091075897217, "logits/rejected": -3.6346709728240967, "logps/chosen": -7.7690935134887695, "logps/rejected": -25.248586654663086, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 2.6729512214660645, "rewards/margins": 2.6729512214660645, "rewards/rejected": 0.0, "step": 1667 }, { "epoch": 9.318435754189943, "grad_norm": 0.9617451051965464, "learning_rate": 4.437365475624131e-07, "logits/chosen": -3.404390335083008, "logits/rejected": -3.368523597717285, "logps/chosen": -5.0989508628845215, "logps/rejected": -44.76288604736328, "loss": 0.1224, "rewards/accuracies": 1.0, "rewards/chosen": 3.140861988067627, "rewards/margins": 3.140861988067627, "rewards/rejected": 0.0, "step": 1668 }, { "epoch": 9.324022346368714, "grad_norm": 1.001034236506721, "learning_rate": 4.4313091637651394e-07, "logits/chosen": -3.5085628032684326, "logits/rejected": -3.540684938430786, "logps/chosen": -5.532584190368652, "logps/rejected": -43.663787841796875, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.869607925415039, "rewards/margins": 2.869607925415039, "rewards/rejected": 0.0, "step": 1669 }, { "epoch": 9.329608938547485, "grad_norm": 0.705774360061405, "learning_rate": 4.425253697082007e-07, "logits/chosen": -3.2584965229034424, "logits/rejected": -3.2584667205810547, "logps/chosen": -12.138728141784668, "logps/rejected": -36.21665954589844, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 3.29658579826355, "rewards/margins": 3.29658579826355, "rewards/rejected": 0.0, "step": 1670 }, { "epoch": 9.335195530726256, "grad_norm": 0.8504428691008185, "learning_rate": 4.419199084574237e-07, "logits/chosen": -3.736549139022827, "logits/rejected": -3.687662124633789, "logps/chosen": -4.930518627166748, "logps/rejected": -64.18195343017578, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 2.6274194717407227, "rewards/margins": 2.6274194717407227, "rewards/rejected": 0.0, "step": 1671 }, { "epoch": 9.340782122905027, "grad_norm": 0.9255245693297752, "learning_rate": 4.4131453352400656e-07, "logits/chosen": -3.119297981262207, "logits/rejected": -3.45367431640625, "logps/chosen": -1.8677343130111694, "logps/rejected": -39.497676849365234, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 2.3898773193359375, "rewards/margins": 2.3898773193359375, "rewards/rejected": 0.0, "step": 1672 }, { "epoch": 9.346368715083798, "grad_norm": 1.0796381261006927, "learning_rate": 4.407092458076436e-07, "logits/chosen": -3.3355917930603027, "logits/rejected": -3.4962973594665527, "logps/chosen": -9.620408058166504, "logps/rejected": -50.528167724609375, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 3.2583227157592773, "rewards/margins": 3.2583227157592773, "rewards/rejected": 0.0, "step": 1673 }, { "epoch": 9.35195530726257, "grad_norm": 0.979557178370967, "learning_rate": 4.401040462079006e-07, "logits/chosen": -3.594233512878418, "logits/rejected": -3.315873384475708, "logps/chosen": -1.0419033765792847, "logps/rejected": -37.96501922607422, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 2.4149818420410156, "rewards/margins": 2.4149818420410156, "rewards/rejected": 0.0, "step": 1674 }, { "epoch": 9.35754189944134, "grad_norm": 2.3850754165564654, "learning_rate": 4.394989356242119e-07, "logits/chosen": -3.544698476791382, "logits/rejected": -3.6464734077453613, "logps/chosen": -1.730089545249939, "logps/rejected": -61.054046630859375, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 2.486802577972412, "rewards/margins": 2.486802577972412, "rewards/rejected": 0.0, "step": 1675 }, { "epoch": 9.363128491620111, "grad_norm": 0.7804655522429714, "learning_rate": 4.388939149558794e-07, "logits/chosen": -3.38547945022583, "logits/rejected": -3.43357253074646, "logps/chosen": -0.42281386256217957, "logps/rejected": -62.40565872192383, "loss": 0.1378, "rewards/accuracies": 1.0, "rewards/chosen": 1.6719470024108887, "rewards/margins": 1.6719470024108887, "rewards/rejected": 0.0, "step": 1676 }, { "epoch": 9.368715083798882, "grad_norm": 0.6369642249548819, "learning_rate": 4.3828898510207177e-07, "logits/chosen": -3.666167974472046, "logits/rejected": -3.686500310897827, "logps/chosen": -9.158576011657715, "logps/rejected": -47.06145477294922, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.7776994705200195, "rewards/margins": 2.7776994705200195, "rewards/rejected": 0.0, "step": 1677 }, { "epoch": 9.374301675977653, "grad_norm": 0.7228971001669937, "learning_rate": 4.376841469618223e-07, "logits/chosen": -3.5081560611724854, "logits/rejected": -3.5118141174316406, "logps/chosen": -2.3876757621765137, "logps/rejected": -44.77592849731445, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 2.8737363815307617, "rewards/margins": 2.8737363815307617, "rewards/rejected": 0.0, "step": 1678 }, { "epoch": 9.379888268156424, "grad_norm": 2.1531995223331353, "learning_rate": 4.370794014340282e-07, "logits/chosen": -3.298464298248291, "logits/rejected": -3.3193044662475586, "logps/chosen": -19.938934326171875, "logps/rejected": -40.75102996826172, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 2.9173717498779297, "rewards/margins": 2.9173717498779297, "rewards/rejected": 0.0, "step": 1679 }, { "epoch": 9.385474860335195, "grad_norm": 3.2467322257414124, "learning_rate": 4.3647474941744945e-07, "logits/chosen": -3.4913649559020996, "logits/rejected": -3.5214719772338867, "logps/chosen": -0.6102027297019958, "logps/rejected": -52.77445602416992, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 1.9872210025787354, "rewards/margins": 1.9872210025787354, "rewards/rejected": 0.0, "step": 1680 }, { "epoch": 9.385474860335195, "eval_logits/chosen": -3.311461925506592, "eval_logits/rejected": -3.4402499198913574, "eval_logps/chosen": -27.66586685180664, "eval_logps/rejected": -50.32343292236328, "eval_loss": 0.8500019907951355, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 0.24041762948036194, "eval_rewards/margins": 0.24041762948036194, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6971, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 1680 }, { "epoch": 9.391061452513966, "grad_norm": 1.137546018759431, "learning_rate": 4.3587019181070593e-07, "logits/chosen": -3.4446256160736084, "logits/rejected": -3.4689605236053467, "logps/chosen": -0.7464298605918884, "logps/rejected": -40.93949890136719, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 1.8554067611694336, "rewards/margins": 1.8554067611694336, "rewards/rejected": 0.0, "step": 1681 }, { "epoch": 9.396648044692737, "grad_norm": 2.124718311740144, "learning_rate": 4.3526572951227835e-07, "logits/chosen": -3.2802059650421143, "logits/rejected": -3.5219709873199463, "logps/chosen": -2.9198992252349854, "logps/rejected": -52.50535202026367, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 2.9849014282226562, "rewards/margins": 2.9849014282226562, "rewards/rejected": 0.0, "step": 1682 }, { "epoch": 9.402234636871508, "grad_norm": 1.1790212523455037, "learning_rate": 4.3466136342050545e-07, "logits/chosen": -3.5371663570404053, "logits/rejected": -3.54402494430542, "logps/chosen": -0.9409477710723877, "logps/rejected": -40.8814697265625, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 2.1401760578155518, "rewards/margins": 2.1401760578155518, "rewards/rejected": 0.0, "step": 1683 }, { "epoch": 9.40782122905028, "grad_norm": 0.9343191536862998, "learning_rate": 4.340570944335825e-07, "logits/chosen": -3.3438329696655273, "logits/rejected": -3.513079881668091, "logps/chosen": -2.165937900543213, "logps/rejected": -43.00746154785156, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 2.3748960494995117, "rewards/margins": 2.3748960494995117, "rewards/rejected": 0.0, "step": 1684 }, { "epoch": 9.41340782122905, "grad_norm": 1.889334067259178, "learning_rate": 4.334529234495611e-07, "logits/chosen": -3.2260501384735107, "logits/rejected": -3.549238443374634, "logps/chosen": -4.397539138793945, "logps/rejected": -42.89970397949219, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 2.7786693572998047, "rewards/margins": 2.7786693572998047, "rewards/rejected": 0.0, "step": 1685 }, { "epoch": 9.418994413407821, "grad_norm": 1.9003856935501429, "learning_rate": 4.3284885136634697e-07, "logits/chosen": -3.3984479904174805, "logits/rejected": -3.4613521099090576, "logps/chosen": -11.049154281616211, "logps/rejected": -48.33641052246094, "loss": 0.1566, "rewards/accuracies": 1.0, "rewards/chosen": 2.3414509296417236, "rewards/margins": 2.3414509296417236, "rewards/rejected": 0.0, "step": 1686 }, { "epoch": 9.424581005586592, "grad_norm": 0.6644543604790718, "learning_rate": 4.322448790816984e-07, "logits/chosen": -3.161001682281494, "logits/rejected": -3.209975242614746, "logps/chosen": -24.673967361450195, "logps/rejected": -42.61569595336914, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 3.0251245498657227, "rewards/margins": 3.0251245498657227, "rewards/rejected": 0.0, "step": 1687 }, { "epoch": 9.430167597765363, "grad_norm": 0.8299100994106664, "learning_rate": 4.3164100749322614e-07, "logits/chosen": -3.7852985858917236, "logits/rejected": -3.730530023574829, "logps/chosen": -7.574895858764648, "logps/rejected": -30.36754035949707, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 3.8281028270721436, "rewards/margins": 3.8281028270721436, "rewards/rejected": 0.0, "step": 1688 }, { "epoch": 9.435754189944134, "grad_norm": 2.6184409693735207, "learning_rate": 4.310372374983909e-07, "logits/chosen": -3.561497926712036, "logits/rejected": -3.6270360946655273, "logps/chosen": -1.438293218612671, "logps/rejected": -54.53354263305664, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 1.8125982284545898, "rewards/margins": 1.8125982284545898, "rewards/rejected": 0.0, "step": 1689 }, { "epoch": 9.441340782122905, "grad_norm": 0.6631884411592908, "learning_rate": 4.3043356999450223e-07, "logits/chosen": -3.448148727416992, "logits/rejected": -3.5284643173217773, "logps/chosen": -2.6424715518951416, "logps/rejected": -100.07594299316406, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 2.3525705337524414, "rewards/margins": 2.3525705337524414, "rewards/rejected": 0.0, "step": 1690 }, { "epoch": 9.446927374301676, "grad_norm": 1.1318458693737592, "learning_rate": 4.298300058787176e-07, "logits/chosen": -3.44157075881958, "logits/rejected": -3.594092607498169, "logps/chosen": -2.2108848094940186, "logps/rejected": -79.8294677734375, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 2.4404940605163574, "rewards/margins": 2.4404940605163574, "rewards/rejected": 0.0, "step": 1691 }, { "epoch": 9.452513966480447, "grad_norm": 1.4703698634310705, "learning_rate": 4.292265460480411e-07, "logits/chosen": -3.5306217670440674, "logits/rejected": -3.624938726425171, "logps/chosen": -0.5446736812591553, "logps/rejected": -133.71543884277344, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": 1.7631876468658447, "rewards/margins": 1.7631876468658447, "rewards/rejected": 0.0, "step": 1692 }, { "epoch": 9.458100558659218, "grad_norm": 3.305936388380418, "learning_rate": 4.2862319139932094e-07, "logits/chosen": -3.560810089111328, "logits/rejected": -3.534581184387207, "logps/chosen": -4.433304309844971, "logps/rejected": -50.900962829589844, "loss": 0.0845, "rewards/accuracies": 1.0, "rewards/chosen": 3.496093273162842, "rewards/margins": 3.496093273162842, "rewards/rejected": 0.0, "step": 1693 }, { "epoch": 9.46368715083799, "grad_norm": 0.7327068207557769, "learning_rate": 4.2801994282925e-07, "logits/chosen": -3.4338550567626953, "logits/rejected": -3.4327404499053955, "logps/chosen": -7.381734848022461, "logps/rejected": -40.26165771484375, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 2.829134941101074, "rewards/margins": 2.829134941101074, "rewards/rejected": 0.0, "step": 1694 }, { "epoch": 9.46927374301676, "grad_norm": 4.163846340676333, "learning_rate": 4.27416801234363e-07, "logits/chosen": -3.3018383979797363, "logits/rejected": -3.4342386722564697, "logps/chosen": -2.5537142753601074, "logps/rejected": -42.40884017944336, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": 2.7506661415100098, "rewards/margins": 2.7506661415100098, "rewards/rejected": 0.0, "step": 1695 }, { "epoch": 9.474860335195531, "grad_norm": 0.6920824321518683, "learning_rate": 4.2681376751103573e-07, "logits/chosen": -3.2477638721466064, "logits/rejected": -3.4153714179992676, "logps/chosen": -1.4177420139312744, "logps/rejected": -82.40536499023438, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 2.5145020484924316, "rewards/margins": 2.5145020484924316, "rewards/rejected": 0.0, "step": 1696 }, { "epoch": 9.480446927374302, "grad_norm": 0.6608240332889246, "learning_rate": 4.2621084255548356e-07, "logits/chosen": -3.209749698638916, "logits/rejected": -3.36749005317688, "logps/chosen": -4.849580764770508, "logps/rejected": -69.09649658203125, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": 2.505600929260254, "rewards/margins": 2.505600929260254, "rewards/rejected": 0.0, "step": 1697 }, { "epoch": 9.486033519553073, "grad_norm": 0.8748741223269052, "learning_rate": 4.2560802726376066e-07, "logits/chosen": -3.658421754837036, "logits/rejected": -3.5597047805786133, "logps/chosen": -7.290219306945801, "logps/rejected": -51.058555603027344, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 3.701432466506958, "rewards/margins": 3.701432466506958, "rewards/rejected": 0.0, "step": 1698 }, { "epoch": 9.491620111731844, "grad_norm": 0.7820758644196301, "learning_rate": 4.250053225317576e-07, "logits/chosen": -3.325294017791748, "logits/rejected": -3.598661184310913, "logps/chosen": -4.855356693267822, "logps/rejected": -27.665653228759766, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": 2.507937431335449, "rewards/margins": 2.507937431335449, "rewards/rejected": 0.0, "step": 1699 }, { "epoch": 9.497206703910614, "grad_norm": 4.139095095118339, "learning_rate": 4.24402729255201e-07, "logits/chosen": -3.417232036590576, "logits/rejected": -3.365792751312256, "logps/chosen": -13.969809532165527, "logps/rejected": -66.77743530273438, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.26210355758667, "rewards/margins": 2.26210355758667, "rewards/rejected": 0.0, "step": 1700 }, { "epoch": 9.497206703910614, "eval_logits/chosen": -3.3096587657928467, "eval_logits/rejected": -3.436217784881592, "eval_logps/chosen": -27.44838523864746, "eval_logps/rejected": -50.46533966064453, "eval_loss": 0.8455008268356323, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": 0.2621658444404602, "eval_rewards/margins": 0.2621658444404602, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7289, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 1700 }, { "epoch": 9.502793296089386, "grad_norm": 1.412572136058337, "learning_rate": 4.238002483296521e-07, "logits/chosen": -3.4035112857818604, "logits/rejected": -3.3024539947509766, "logps/chosen": -3.373434066772461, "logps/rejected": -29.926616668701172, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 2.4663188457489014, "rewards/margins": 2.4663188457489014, "rewards/rejected": 0.0, "step": 1701 }, { "epoch": 9.508379888268156, "grad_norm": 0.6523008592710757, "learning_rate": 4.2319788065050443e-07, "logits/chosen": -3.4485371112823486, "logits/rejected": -3.6596977710723877, "logps/chosen": -15.020283699035645, "logps/rejected": -46.4030876159668, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 2.8614203929901123, "rewards/margins": 2.8614203929901123, "rewards/rejected": 0.0, "step": 1702 }, { "epoch": 9.513966480446927, "grad_norm": 0.9840730198114828, "learning_rate": 4.2259562711298393e-07, "logits/chosen": -3.4605963230133057, "logits/rejected": -3.5639140605926514, "logps/chosen": -1.4424254894256592, "logps/rejected": -37.53175354003906, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.5046846866607666, "rewards/margins": 2.5046846866607666, "rewards/rejected": 0.0, "step": 1703 }, { "epoch": 9.519553072625698, "grad_norm": 3.3210097153062015, "learning_rate": 4.2199348861214646e-07, "logits/chosen": -3.572042226791382, "logits/rejected": -3.51839017868042, "logps/chosen": -2.0930020809173584, "logps/rejected": -56.61003875732422, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 2.452495574951172, "rewards/margins": 2.452495574951172, "rewards/rejected": 0.0, "step": 1704 }, { "epoch": 9.525139664804469, "grad_norm": 2.8093863564024937, "learning_rate": 4.2139146604287714e-07, "logits/chosen": -3.474696159362793, "logits/rejected": -3.5628952980041504, "logps/chosen": -3.7944483757019043, "logps/rejected": -56.560157775878906, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 1.9909389019012451, "rewards/margins": 1.9909389019012451, "rewards/rejected": 0.0, "step": 1705 }, { "epoch": 9.53072625698324, "grad_norm": 1.2376177190857576, "learning_rate": 4.2078956029988883e-07, "logits/chosen": -3.311756134033203, "logits/rejected": -3.4236950874328613, "logps/chosen": -5.494135856628418, "logps/rejected": -78.06456756591797, "loss": 0.129, "rewards/accuracies": 1.0, "rewards/chosen": 2.181332588195801, "rewards/margins": 2.181332588195801, "rewards/rejected": 0.0, "step": 1706 }, { "epoch": 9.53631284916201, "grad_norm": 0.8740634913580586, "learning_rate": 4.2018777227772026e-07, "logits/chosen": -3.3100528717041016, "logits/rejected": -3.2620058059692383, "logps/chosen": -15.985023498535156, "logps/rejected": -26.272205352783203, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 4.03938102722168, "rewards/margins": 4.03938102722168, "rewards/rejected": 0.0, "step": 1707 }, { "epoch": 9.541899441340782, "grad_norm": 0.7045059680653859, "learning_rate": 4.1958610287073585e-07, "logits/chosen": -3.4263386726379395, "logits/rejected": -3.48311185836792, "logps/chosen": -2.8320908546447754, "logps/rejected": -47.739253997802734, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": 2.8688912391662598, "rewards/margins": 2.8688912391662598, "rewards/rejected": 0.0, "step": 1708 }, { "epoch": 9.547486033519553, "grad_norm": 2.939591053489811, "learning_rate": 4.189845529731238e-07, "logits/chosen": -3.4358630180358887, "logits/rejected": -3.4611194133758545, "logps/chosen": -7.619137763977051, "logps/rejected": -36.52315902709961, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 2.6181259155273438, "rewards/margins": 2.6181259155273438, "rewards/rejected": 0.0, "step": 1709 }, { "epoch": 9.553072625698324, "grad_norm": 1.7312824374240363, "learning_rate": 4.183831234788938e-07, "logits/chosen": -3.3011412620544434, "logits/rejected": -3.387990951538086, "logps/chosen": -1.489870309829712, "logps/rejected": -75.22355651855469, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.148293972015381, "rewards/margins": 2.148293972015381, "rewards/rejected": 0.0, "step": 1710 }, { "epoch": 9.558659217877095, "grad_norm": 6.792215059319358, "learning_rate": 4.177818152818775e-07, "logits/chosen": -3.49019193649292, "logits/rejected": -3.5170786380767822, "logps/chosen": -16.729494094848633, "logps/rejected": -36.510337829589844, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 3.3461015224456787, "rewards/margins": 3.3461015224456787, "rewards/rejected": 0.0, "step": 1711 }, { "epoch": 9.564245810055866, "grad_norm": 2.597248724827953, "learning_rate": 4.171806292757262e-07, "logits/chosen": -3.3735554218292236, "logits/rejected": -3.332463264465332, "logps/chosen": -1.008131742477417, "logps/rejected": -58.622379302978516, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 1.7787175178527832, "rewards/margins": 1.7787175178527832, "rewards/rejected": 0.0, "step": 1712 }, { "epoch": 9.569832402234637, "grad_norm": 0.6999497925872811, "learning_rate": 4.165795663539089e-07, "logits/chosen": -3.59106707572937, "logits/rejected": -3.7462661266326904, "logps/chosen": -2.865769386291504, "logps/rejected": -43.900978088378906, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 2.748168468475342, "rewards/margins": 2.748168468475342, "rewards/rejected": 0.0, "step": 1713 }, { "epoch": 9.575418994413408, "grad_norm": 1.299737306466016, "learning_rate": 4.159786274097125e-07, "logits/chosen": -3.423650026321411, "logits/rejected": -3.5724918842315674, "logps/chosen": -2.597463369369507, "logps/rejected": -55.465576171875, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.100991725921631, "rewards/margins": 2.100991725921631, "rewards/rejected": 0.0, "step": 1714 }, { "epoch": 9.581005586592179, "grad_norm": 1.0454743027026128, "learning_rate": 4.1537781333623906e-07, "logits/chosen": -3.250215530395508, "logits/rejected": -3.497128486633301, "logps/chosen": -0.8876857757568359, "logps/rejected": -72.73077392578125, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 1.6695135831832886, "rewards/margins": 1.6695135831832886, "rewards/rejected": 0.0, "step": 1715 }, { "epoch": 9.58659217877095, "grad_norm": 1.0617451284723087, "learning_rate": 4.147771250264054e-07, "logits/chosen": -3.3230700492858887, "logits/rejected": -3.5200843811035156, "logps/chosen": -1.789273977279663, "logps/rejected": -30.924949645996094, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 2.6298274993896484, "rewards/margins": 2.6298274993896484, "rewards/rejected": 0.0, "step": 1716 }, { "epoch": 9.59217877094972, "grad_norm": 2.0966938880006265, "learning_rate": 4.1417656337294124e-07, "logits/chosen": -3.3895654678344727, "logits/rejected": -3.434917449951172, "logps/chosen": -5.88671350479126, "logps/rejected": -39.347511291503906, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 3.851647138595581, "rewards/margins": 3.851647138595581, "rewards/rejected": 0.0, "step": 1717 }, { "epoch": 9.597765363128492, "grad_norm": 1.0592453797314934, "learning_rate": 4.1357612926838855e-07, "logits/chosen": -3.12894344329834, "logits/rejected": -3.056067943572998, "logps/chosen": -0.6702836155891418, "logps/rejected": -49.90453338623047, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 1.6518197059631348, "rewards/margins": 1.6518197059631348, "rewards/rejected": 0.0, "step": 1718 }, { "epoch": 9.603351955307263, "grad_norm": 0.9466857657811402, "learning_rate": 4.1297582360509866e-07, "logits/chosen": -3.5946905612945557, "logits/rejected": -3.457310199737549, "logps/chosen": -1.7465816736221313, "logps/rejected": -28.91625213623047, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 2.1947221755981445, "rewards/margins": 2.1947221755981445, "rewards/rejected": 0.0, "step": 1719 }, { "epoch": 9.608938547486034, "grad_norm": 3.35693357476571, "learning_rate": 4.123756472752331e-07, "logits/chosen": -3.44520902633667, "logits/rejected": -3.411966323852539, "logps/chosen": -0.4839590787887573, "logps/rejected": -38.20967102050781, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 1.7154146432876587, "rewards/margins": 1.7154146432876587, "rewards/rejected": 0.0, "step": 1720 }, { "epoch": 9.608938547486034, "eval_logits/chosen": -3.305182695388794, "eval_logits/rejected": -3.4331936836242676, "eval_logps/chosen": -27.447872161865234, "eval_logps/rejected": -50.44144821166992, "eval_loss": 0.8560366630554199, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": 0.26221686601638794, "eval_rewards/margins": 0.26221686601638794, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6959, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 1720 }, { "epoch": 9.614525139664805, "grad_norm": 0.728579747223021, "learning_rate": 4.117756011707609e-07, "logits/chosen": -3.18867826461792, "logits/rejected": -3.124232292175293, "logps/chosen": -1.6616671085357666, "logps/rejected": -35.0905647277832, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.0302085876464844, "rewards/margins": 2.0302085876464844, "rewards/rejected": 0.0, "step": 1721 }, { "epoch": 9.620111731843576, "grad_norm": 1.4853680298014906, "learning_rate": 4.111756861834571e-07, "logits/chosen": -3.761784553527832, "logits/rejected": -3.7591989040374756, "logps/chosen": -2.763706684112549, "logps/rejected": -29.031394958496094, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": 2.1953248977661133, "rewards/margins": 2.1953248977661133, "rewards/rejected": 0.0, "step": 1722 }, { "epoch": 9.625698324022347, "grad_norm": 2.1376789566184704, "learning_rate": 4.105759032049025e-07, "logits/chosen": -3.2697160243988037, "logits/rejected": -3.1297335624694824, "logps/chosen": -6.834595680236816, "logps/rejected": -24.282554626464844, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": 2.6610021591186523, "rewards/margins": 2.6610021591186523, "rewards/rejected": 0.0, "step": 1723 }, { "epoch": 9.631284916201118, "grad_norm": 1.8294215741490434, "learning_rate": 4.0997625312648127e-07, "logits/chosen": -3.337998867034912, "logits/rejected": -3.355957269668579, "logps/chosen": -2.4976565837860107, "logps/rejected": -99.48643493652344, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 2.2600810527801514, "rewards/margins": 2.2600810527801514, "rewards/rejected": 0.0, "step": 1724 }, { "epoch": 9.636871508379889, "grad_norm": 0.8759311338583263, "learning_rate": 4.093767368393802e-07, "logits/chosen": -2.968350887298584, "logits/rejected": -2.944314479827881, "logps/chosen": -9.29737377166748, "logps/rejected": -84.48825073242188, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": 2.7920689582824707, "rewards/margins": 2.7920689582824707, "rewards/rejected": 0.0, "step": 1725 }, { "epoch": 9.64245810055866, "grad_norm": 1.4459508312142024, "learning_rate": 4.087773552345873e-07, "logits/chosen": -3.326263666152954, "logits/rejected": -3.622400999069214, "logps/chosen": -13.418744087219238, "logps/rejected": -74.73787689208984, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.2520549297332764, "rewards/margins": 3.2520549297332764, "rewards/rejected": 0.0, "step": 1726 }, { "epoch": 9.64804469273743, "grad_norm": 1.0096702618259616, "learning_rate": 4.081781092028905e-07, "logits/chosen": -3.570830821990967, "logits/rejected": -3.3866488933563232, "logps/chosen": -8.488333702087402, "logps/rejected": -47.86735534667969, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": 2.7169766426086426, "rewards/margins": 2.7169766426086426, "rewards/rejected": 0.0, "step": 1727 }, { "epoch": 9.653631284916202, "grad_norm": 0.7058010028158032, "learning_rate": 4.075789996348759e-07, "logits/chosen": -3.570552349090576, "logits/rejected": -3.7115981578826904, "logps/chosen": -6.230530738830566, "logps/rejected": -28.94961929321289, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": 2.987460136413574, "rewards/margins": 2.987460136413574, "rewards/rejected": 0.0, "step": 1728 }, { "epoch": 9.659217877094973, "grad_norm": 1.3403671628087432, "learning_rate": 4.069800274209271e-07, "logits/chosen": -3.5410234928131104, "logits/rejected": -3.5444400310516357, "logps/chosen": -36.93230438232422, "logps/rejected": -43.82135772705078, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 3.602138042449951, "rewards/margins": 3.602138042449951, "rewards/rejected": 0.0, "step": 1729 }, { "epoch": 9.664804469273744, "grad_norm": 1.3274871794888756, "learning_rate": 4.063811934512236e-07, "logits/chosen": -3.160902261734009, "logits/rejected": -3.1793432235717773, "logps/chosen": -7.068883419036865, "logps/rejected": -61.582122802734375, "loss": 0.0925, "rewards/accuracies": 1.0, "rewards/chosen": 3.330369472503662, "rewards/margins": 3.330369472503662, "rewards/rejected": 0.0, "step": 1730 }, { "epoch": 9.670391061452515, "grad_norm": 0.7071744118530354, "learning_rate": 4.0578249861573915e-07, "logits/chosen": -3.534485340118408, "logits/rejected": -3.7163448333740234, "logps/chosen": -2.4362969398498535, "logps/rejected": -60.17002868652344, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": 2.759557008743286, "rewards/margins": 2.759557008743286, "rewards/rejected": 0.0, "step": 1731 }, { "epoch": 9.675977653631286, "grad_norm": 1.3149072273229911, "learning_rate": 4.0518394380424104e-07, "logits/chosen": -3.4997012615203857, "logits/rejected": -3.3523571491241455, "logps/chosen": -4.800792694091797, "logps/rejected": -92.15414428710938, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 2.196422815322876, "rewards/margins": 2.196422815322876, "rewards/rejected": 0.0, "step": 1732 }, { "epoch": 9.681564245810057, "grad_norm": 1.1150452822720032, "learning_rate": 4.0458552990628823e-07, "logits/chosen": -3.3276965618133545, "logits/rejected": -3.60642671585083, "logps/chosen": -3.349475860595703, "logps/rejected": -36.168418884277344, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 3.071549654006958, "rewards/margins": 3.071549654006958, "rewards/rejected": 0.0, "step": 1733 }, { "epoch": 9.687150837988828, "grad_norm": 1.4258397485068366, "learning_rate": 4.039872578112304e-07, "logits/chosen": -3.601954460144043, "logits/rejected": -3.7227015495300293, "logps/chosen": -3.7684290409088135, "logps/rejected": -58.21186828613281, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.8843202590942383, "rewards/margins": 2.8843202590942383, "rewards/rejected": 0.0, "step": 1734 }, { "epoch": 9.692737430167599, "grad_norm": 3.422911984874883, "learning_rate": 4.033891284082064e-07, "logits/chosen": -3.7799041271209717, "logits/rejected": -3.6273603439331055, "logps/chosen": -2.0787601470947266, "logps/rejected": -100.98727416992188, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 1.9293601512908936, "rewards/margins": 1.9293601512908936, "rewards/rejected": 0.0, "step": 1735 }, { "epoch": 9.69832402234637, "grad_norm": 0.7344493698509696, "learning_rate": 4.0279114258614297e-07, "logits/chosen": -3.370419502258301, "logits/rejected": -3.5623855590820312, "logps/chosen": -21.76702880859375, "logps/rejected": -47.909759521484375, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 3.385744094848633, "rewards/margins": 3.385744094848633, "rewards/rejected": 0.0, "step": 1736 }, { "epoch": 9.703910614525139, "grad_norm": 3.7414667340922865, "learning_rate": 4.0219330123375355e-07, "logits/chosen": -3.397857904434204, "logits/rejected": -3.313786268234253, "logps/chosen": -0.6954784393310547, "logps/rejected": -41.59431838989258, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 2.0533552169799805, "rewards/margins": 2.0533552169799805, "rewards/rejected": 0.0, "step": 1737 }, { "epoch": 9.70949720670391, "grad_norm": 1.2846233750707605, "learning_rate": 4.015956052395371e-07, "logits/chosen": -3.5319018363952637, "logits/rejected": -3.5256760120391846, "logps/chosen": -8.303479194641113, "logps/rejected": -34.281288146972656, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 3.6803646087646484, "rewards/margins": 3.6803646087646484, "rewards/rejected": 0.0, "step": 1738 }, { "epoch": 9.71508379888268, "grad_norm": 3.287051408358936, "learning_rate": 4.0099805549177594e-07, "logits/chosen": -3.7380051612854004, "logits/rejected": -3.736550807952881, "logps/chosen": -12.186216354370117, "logps/rejected": -41.61170196533203, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 3.883512258529663, "rewards/margins": 3.883512258529663, "rewards/rejected": 0.0, "step": 1739 }, { "epoch": 9.720670391061452, "grad_norm": 2.496423080365672, "learning_rate": 4.0040065287853563e-07, "logits/chosen": -3.4628448486328125, "logits/rejected": -3.5115394592285156, "logps/chosen": -0.5052372217178345, "logps/rejected": -94.57827758789062, "loss": 0.138, "rewards/accuracies": 1.0, "rewards/chosen": 1.9333962202072144, "rewards/margins": 1.9333962202072144, "rewards/rejected": 0.0, "step": 1740 }, { "epoch": 9.720670391061452, "eval_logits/chosen": -3.311786651611328, "eval_logits/rejected": -3.438049793243408, "eval_logps/chosen": -27.7125301361084, "eval_logps/rejected": -50.265037536621094, "eval_loss": 0.8467325568199158, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.23575131595134735, "eval_rewards/margins": 0.23575131595134735, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7079, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 1740 }, { "epoch": 9.726256983240223, "grad_norm": 0.7484806715848626, "learning_rate": 3.9980339828766304e-07, "logits/chosen": -3.3347198963165283, "logits/rejected": -3.5363245010375977, "logps/chosen": -0.5856008529663086, "logps/rejected": -56.053314208984375, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 1.6698448657989502, "rewards/margins": 1.6698448657989502, "rewards/rejected": 0.0, "step": 1741 }, { "epoch": 9.731843575418994, "grad_norm": 0.6762585368094688, "learning_rate": 3.9920629260678437e-07, "logits/chosen": -3.442697525024414, "logits/rejected": -3.5688514709472656, "logps/chosen": -4.242497444152832, "logps/rejected": -44.710716247558594, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": 3.1763532161712646, "rewards/margins": 3.1763532161712646, "rewards/rejected": 0.0, "step": 1742 }, { "epoch": 9.737430167597765, "grad_norm": 0.7985715083369194, "learning_rate": 3.986093367233054e-07, "logits/chosen": -3.028599977493286, "logits/rejected": -3.074604034423828, "logps/chosen": -9.273598670959473, "logps/rejected": -37.633121490478516, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 2.960770845413208, "rewards/margins": 2.960770845413208, "rewards/rejected": 0.0, "step": 1743 }, { "epoch": 9.743016759776536, "grad_norm": 2.082457001958603, "learning_rate": 3.980125315244087e-07, "logits/chosen": -3.359621047973633, "logits/rejected": -3.2890217304229736, "logps/chosen": -9.379022598266602, "logps/rejected": -40.036903381347656, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783303260803223, "rewards/margins": 2.7783303260803223, "rewards/rejected": 0.0, "step": 1744 }, { "epoch": 9.748603351955307, "grad_norm": 0.866636462205181, "learning_rate": 3.9741587789705315e-07, "logits/chosen": -3.580343723297119, "logits/rejected": -3.5654988288879395, "logps/chosen": -18.33171272277832, "logps/rejected": -43.90819549560547, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": 3.1261768341064453, "rewards/margins": 3.1261768341064453, "rewards/rejected": 0.0, "step": 1745 }, { "epoch": 9.754189944134078, "grad_norm": 1.7091966122334377, "learning_rate": 3.968193767279722e-07, "logits/chosen": -3.5168099403381348, "logits/rejected": -3.483940601348877, "logps/chosen": -0.9022572040557861, "logps/rejected": -52.251983642578125, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 2.030081272125244, "rewards/margins": 2.030081272125244, "rewards/rejected": 0.0, "step": 1746 }, { "epoch": 9.759776536312849, "grad_norm": 1.1050512299361617, "learning_rate": 3.962230289036731e-07, "logits/chosen": -3.5157039165496826, "logits/rejected": -3.4680283069610596, "logps/chosen": -1.9303457736968994, "logps/rejected": -41.15681457519531, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.6821866035461426, "rewards/margins": 2.6821866035461426, "rewards/rejected": 0.0, "step": 1747 }, { "epoch": 9.76536312849162, "grad_norm": 1.1568480157771526, "learning_rate": 3.956268353104345e-07, "logits/chosen": -3.50109601020813, "logits/rejected": -3.5288844108581543, "logps/chosen": -7.669013977050781, "logps/rejected": -69.0399398803711, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 2.297898292541504, "rewards/margins": 2.297898292541504, "rewards/rejected": 0.0, "step": 1748 }, { "epoch": 9.77094972067039, "grad_norm": 0.8890581971010301, "learning_rate": 3.950307968343065e-07, "logits/chosen": -3.638458251953125, "logits/rejected": -3.654754161834717, "logps/chosen": -6.63097620010376, "logps/rejected": -56.3765869140625, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 2.622446060180664, "rewards/margins": 2.622446060180664, "rewards/rejected": 0.0, "step": 1749 }, { "epoch": 9.776536312849162, "grad_norm": 3.358159262414772, "learning_rate": 3.944349143611085e-07, "logits/chosen": -3.5754904747009277, "logits/rejected": -3.7089226245880127, "logps/chosen": -5.279230117797852, "logps/rejected": -46.64799880981445, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 2.894991874694824, "rewards/margins": 2.894991874694824, "rewards/rejected": 0.0, "step": 1750 }, { "epoch": 9.782122905027933, "grad_norm": 1.3118787614294551, "learning_rate": 3.938391887764277e-07, "logits/chosen": -3.4574172496795654, "logits/rejected": -3.321378469467163, "logps/chosen": -16.185279846191406, "logps/rejected": -48.61097717285156, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 4.327827453613281, "rewards/margins": 4.327827453613281, "rewards/rejected": 0.0, "step": 1751 }, { "epoch": 9.787709497206704, "grad_norm": 1.2003247191612298, "learning_rate": 3.932436209656188e-07, "logits/chosen": -3.395296096801758, "logits/rejected": -3.4303250312805176, "logps/chosen": -20.829021453857422, "logps/rejected": -52.34408187866211, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 3.022113800048828, "rewards/margins": 3.022113800048828, "rewards/rejected": 0.0, "step": 1752 }, { "epoch": 9.793296089385475, "grad_norm": 1.243628704239324, "learning_rate": 3.926482118138013e-07, "logits/chosen": -3.3079471588134766, "logits/rejected": -3.23549222946167, "logps/chosen": -0.5884277820587158, "logps/rejected": -66.92959594726562, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 1.4585797786712646, "rewards/margins": 1.4585797786712646, "rewards/rejected": 0.0, "step": 1753 }, { "epoch": 9.798882681564246, "grad_norm": 2.271405711594028, "learning_rate": 3.920529622058594e-07, "logits/chosen": -3.2228918075561523, "logits/rejected": -3.3771908283233643, "logps/chosen": -31.38953971862793, "logps/rejected": -34.92992401123047, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 3.8909780979156494, "rewards/margins": 3.8909780979156494, "rewards/rejected": 0.0, "step": 1754 }, { "epoch": 9.804469273743017, "grad_norm": 0.7039622569173002, "learning_rate": 3.914578730264399e-07, "logits/chosen": -3.1822798252105713, "logits/rejected": -3.574373722076416, "logps/chosen": -1.4600975513458252, "logps/rejected": -57.503257751464844, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": 2.233499765396118, "rewards/margins": 2.233499765396118, "rewards/rejected": 0.0, "step": 1755 }, { "epoch": 9.810055865921788, "grad_norm": 5.120097178309147, "learning_rate": 3.9086294515995157e-07, "logits/chosen": -3.4335861206054688, "logits/rejected": -3.2751214504241943, "logps/chosen": -18.258726119995117, "logps/rejected": -35.95403289794922, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 3.150092601776123, "rewards/margins": 3.150092601776123, "rewards/rejected": 0.0, "step": 1756 }, { "epoch": 9.815642458100559, "grad_norm": 0.9911180404327901, "learning_rate": 3.9026817949056276e-07, "logits/chosen": -3.4206182956695557, "logits/rejected": -3.3530704975128174, "logps/chosen": -3.826144218444824, "logps/rejected": -99.31694030761719, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 2.372318744659424, "rewards/margins": 2.372318744659424, "rewards/rejected": 0.0, "step": 1757 }, { "epoch": 9.82122905027933, "grad_norm": 0.9858380401630963, "learning_rate": 3.896735769022014e-07, "logits/chosen": -3.5911967754364014, "logits/rejected": -3.4111180305480957, "logps/chosen": -6.001222610473633, "logps/rejected": -39.18571090698242, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 3.1782052516937256, "rewards/margins": 3.1782052516937256, "rewards/rejected": 0.0, "step": 1758 }, { "epoch": 9.8268156424581, "grad_norm": 1.20868578492993, "learning_rate": 3.8907913827855296e-07, "logits/chosen": -3.6774656772613525, "logits/rejected": -3.4747474193573, "logps/chosen": -1.6732978820800781, "logps/rejected": -38.50730514526367, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": 2.597053050994873, "rewards/margins": 2.597053050994873, "rewards/rejected": 0.0, "step": 1759 }, { "epoch": 9.832402234636872, "grad_norm": 2.5127368290338428, "learning_rate": 3.8848486450305886e-07, "logits/chosen": -3.605839490890503, "logits/rejected": -3.558403968811035, "logps/chosen": -5.57737922668457, "logps/rejected": -33.1185188293457, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 3.048252582550049, "rewards/margins": 3.048252582550049, "rewards/rejected": 0.0, "step": 1760 }, { "epoch": 9.832402234636872, "eval_logits/chosen": -3.3077073097229004, "eval_logits/rejected": -3.4372220039367676, "eval_logps/chosen": -27.46579360961914, "eval_logps/rejected": -49.97900390625, "eval_loss": 0.8316324353218079, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.26042503118515015, "eval_rewards/margins": 0.26042503118515015, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6853, "eval_samples_per_second": 9.484, "eval_steps_per_second": 0.306, "step": 1760 }, { "epoch": 9.837988826815643, "grad_norm": 0.8045617035115878, "learning_rate": 3.878907564589158e-07, "logits/chosen": -3.4086174964904785, "logits/rejected": -3.4591829776763916, "logps/chosen": -0.9813145399093628, "logps/rejected": -36.769046783447266, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.1813831329345703, "rewards/margins": 2.1813831329345703, "rewards/rejected": 0.0, "step": 1761 }, { "epoch": 9.843575418994414, "grad_norm": 1.7989369383734568, "learning_rate": 3.872968150290745e-07, "logits/chosen": -3.2892017364501953, "logits/rejected": -3.3393900394439697, "logps/chosen": -3.108140230178833, "logps/rejected": -91.55709838867188, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 2.5772852897644043, "rewards/margins": 2.5772852897644043, "rewards/rejected": 0.0, "step": 1762 }, { "epoch": 9.849162011173185, "grad_norm": 1.9401687622075436, "learning_rate": 3.867030410962372e-07, "logits/chosen": -3.1430904865264893, "logits/rejected": -2.9523463249206543, "logps/chosen": -27.47216796875, "logps/rejected": -23.676525115966797, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 3.5253515243530273, "rewards/margins": 3.5253515243530273, "rewards/rejected": 0.0, "step": 1763 }, { "epoch": 9.854748603351956, "grad_norm": 0.8372219159775339, "learning_rate": 3.86109435542858e-07, "logits/chosen": -3.4679312705993652, "logits/rejected": -3.3298377990722656, "logps/chosen": -8.737722396850586, "logps/rejected": -47.09326934814453, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 3.532306671142578, "rewards/margins": 3.532306671142578, "rewards/rejected": 0.0, "step": 1764 }, { "epoch": 9.860335195530727, "grad_norm": 0.8523774604103546, "learning_rate": 3.855159992511403e-07, "logits/chosen": -3.185344696044922, "logits/rejected": -3.3517708778381348, "logps/chosen": -5.29677152633667, "logps/rejected": -92.0924072265625, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 1.8936386108398438, "rewards/margins": 1.8936386108398438, "rewards/rejected": 0.0, "step": 1765 }, { "epoch": 9.865921787709498, "grad_norm": 1.680458582515685, "learning_rate": 3.8492273310303615e-07, "logits/chosen": -3.3776400089263916, "logits/rejected": -3.503953218460083, "logps/chosen": -0.49245208501815796, "logps/rejected": -85.63204956054688, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": 1.7712440490722656, "rewards/margins": 1.7712440490722656, "rewards/rejected": 0.0, "step": 1766 }, { "epoch": 9.871508379888269, "grad_norm": 2.7525121422880634, "learning_rate": 3.8432963798024493e-07, "logits/chosen": -3.604893445968628, "logits/rejected": -3.671189308166504, "logps/chosen": -9.61042594909668, "logps/rejected": -50.47281265258789, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": 3.054548978805542, "rewards/margins": 3.054548978805542, "rewards/rejected": 0.0, "step": 1767 }, { "epoch": 9.87709497206704, "grad_norm": 1.263736681740419, "learning_rate": 3.837367147642112e-07, "logits/chosen": -3.3494186401367188, "logits/rejected": -3.480822801589966, "logps/chosen": -1.6545333862304688, "logps/rejected": -57.14366149902344, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 1.903753399848938, "rewards/margins": 1.903753399848938, "rewards/rejected": 0.0, "step": 1768 }, { "epoch": 9.88268156424581, "grad_norm": 1.9278952942148668, "learning_rate": 3.8314396433612476e-07, "logits/chosen": -3.361748456954956, "logits/rejected": -3.4422905445098877, "logps/chosen": -11.146872520446777, "logps/rejected": -44.38884353637695, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 2.3656280040740967, "rewards/margins": 2.3656280040740967, "rewards/rejected": 0.0, "step": 1769 }, { "epoch": 9.888268156424582, "grad_norm": 0.9324815106028379, "learning_rate": 3.825513875769184e-07, "logits/chosen": -3.4899258613586426, "logits/rejected": -3.5031919479370117, "logps/chosen": -3.4244465827941895, "logps/rejected": -52.209190368652344, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 2.807156801223755, "rewards/margins": 2.807156801223755, "rewards/rejected": 0.0, "step": 1770 }, { "epoch": 9.893854748603353, "grad_norm": 0.7663472924188264, "learning_rate": 3.819589853672664e-07, "logits/chosen": -3.441516160964966, "logits/rejected": -3.580808162689209, "logps/chosen": -2.0890583992004395, "logps/rejected": -41.098140716552734, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": 2.743102550506592, "rewards/margins": 2.743102550506592, "rewards/rejected": 0.0, "step": 1771 }, { "epoch": 9.899441340782122, "grad_norm": 2.1268683325530326, "learning_rate": 3.8136675858758413e-07, "logits/chosen": -3.400416851043701, "logits/rejected": -3.420621156692505, "logps/chosen": -7.7488837242126465, "logps/rejected": -56.89590072631836, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 2.7369866371154785, "rewards/margins": 2.7369866371154785, "rewards/rejected": 0.0, "step": 1772 }, { "epoch": 9.905027932960895, "grad_norm": 0.6675627638984232, "learning_rate": 3.8077470811802607e-07, "logits/chosen": -3.4226248264312744, "logits/rejected": -3.3723502159118652, "logps/chosen": -6.033211708068848, "logps/rejected": -28.20562744140625, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 3.184857130050659, "rewards/margins": 3.184857130050659, "rewards/rejected": 0.0, "step": 1773 }, { "epoch": 9.910614525139664, "grad_norm": 1.530320240689921, "learning_rate": 3.801828348384846e-07, "logits/chosen": -3.1324992179870605, "logits/rejected": -3.481651782989502, "logps/chosen": -1.784105896949768, "logps/rejected": -68.61576843261719, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 2.685253143310547, "rewards/margins": 2.685253143310547, "rewards/rejected": 0.0, "step": 1774 }, { "epoch": 9.916201117318435, "grad_norm": 1.722119950356398, "learning_rate": 3.7959113962858874e-07, "logits/chosen": -3.3707611560821533, "logits/rejected": -3.356269359588623, "logps/chosen": -1.1857571601867676, "logps/rejected": -35.922237396240234, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 2.640213966369629, "rewards/margins": 2.640213966369629, "rewards/rejected": 0.0, "step": 1775 }, { "epoch": 9.921787709497206, "grad_norm": 0.8925646178900609, "learning_rate": 3.789996233677032e-07, "logits/chosen": -3.47371506690979, "logits/rejected": -3.368332862854004, "logps/chosen": -6.826603412628174, "logps/rejected": -26.780426025390625, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 2.414897918701172, "rewards/margins": 2.414897918701172, "rewards/rejected": 0.0, "step": 1776 }, { "epoch": 9.927374301675977, "grad_norm": 0.6691219573900442, "learning_rate": 3.7840828693492616e-07, "logits/chosen": -3.4892804622650146, "logits/rejected": -3.559783458709717, "logps/chosen": -12.715020179748535, "logps/rejected": -58.34233093261719, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 3.7501840591430664, "rewards/margins": 3.7501840591430664, "rewards/rejected": 0.0, "step": 1777 }, { "epoch": 9.932960893854748, "grad_norm": 2.2085090168636925, "learning_rate": 3.77817131209089e-07, "logits/chosen": -3.3483548164367676, "logits/rejected": -3.464453935623169, "logps/chosen": -3.2515854835510254, "logps/rejected": -60.47914123535156, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.664233684539795, "rewards/margins": 2.664233684539795, "rewards/rejected": 0.0, "step": 1778 }, { "epoch": 9.938547486033519, "grad_norm": 0.9067412657774024, "learning_rate": 3.772261570687547e-07, "logits/chosen": -3.5294055938720703, "logits/rejected": -3.6384940147399902, "logps/chosen": -0.8120299577713013, "logps/rejected": -51.77969741821289, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.22739315032959, "rewards/margins": 2.22739315032959, "rewards/rejected": 0.0, "step": 1779 }, { "epoch": 9.94413407821229, "grad_norm": 0.792412217652063, "learning_rate": 3.7663536539221557e-07, "logits/chosen": -2.7787868976593018, "logits/rejected": -2.9306488037109375, "logps/chosen": -43.92116165161133, "logps/rejected": -47.89521026611328, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 3.404733657836914, "rewards/margins": 3.404733657836914, "rewards/rejected": 0.0, "step": 1780 }, { "epoch": 9.94413407821229, "eval_logits/chosen": -3.3067257404327393, "eval_logits/rejected": -3.434464693069458, "eval_logps/chosen": -27.879383087158203, "eval_logps/rejected": -50.4825439453125, "eval_loss": 0.8560438752174377, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.2190660983324051, "eval_rewards/margins": 0.2190660983324051, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7005, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 1780 }, { "epoch": 9.949720670391061, "grad_norm": 4.858421110396251, "learning_rate": 3.760447570574935e-07, "logits/chosen": -3.4732677936553955, "logits/rejected": -3.3451426029205322, "logps/chosen": -3.260953664779663, "logps/rejected": -59.285213470458984, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 1.7885258197784424, "rewards/margins": 1.7885258197784424, "rewards/rejected": 0.0, "step": 1781 }, { "epoch": 9.955307262569832, "grad_norm": 1.2430823107046305, "learning_rate": 3.754543329423379e-07, "logits/chosen": -3.2819435596466064, "logits/rejected": -3.44335675239563, "logps/chosen": -2.0860066413879395, "logps/rejected": -70.48025512695312, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 2.7055273056030273, "rewards/margins": 2.7055273056030273, "rewards/rejected": 0.0, "step": 1782 }, { "epoch": 9.960893854748603, "grad_norm": 1.5458676899178516, "learning_rate": 3.748640939242238e-07, "logits/chosen": -3.4624369144439697, "logits/rejected": -3.2279250621795654, "logps/chosen": -1.4880242347717285, "logps/rejected": -44.38206481933594, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 2.151301145553589, "rewards/margins": 2.151301145553589, "rewards/rejected": 0.0, "step": 1783 }, { "epoch": 9.966480446927374, "grad_norm": 2.8207054308446797, "learning_rate": 3.7427404088035154e-07, "logits/chosen": -3.529907703399658, "logits/rejected": -3.399103879928589, "logps/chosen": -7.378881454467773, "logps/rejected": -29.37787437438965, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": 3.654884099960327, "rewards/margins": 3.654884099960327, "rewards/rejected": 0.0, "step": 1784 }, { "epoch": 9.972067039106145, "grad_norm": 2.365062685396757, "learning_rate": 3.736841746876452e-07, "logits/chosen": -3.386728286743164, "logits/rejected": -3.290909767150879, "logps/chosen": -1.703054428100586, "logps/rejected": -28.475000381469727, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 2.1432785987854004, "rewards/margins": 2.1432785987854004, "rewards/rejected": 0.0, "step": 1785 }, { "epoch": 9.977653631284916, "grad_norm": 1.6920144311296252, "learning_rate": 3.730944962227508e-07, "logits/chosen": -3.4041218757629395, "logits/rejected": -3.3558971881866455, "logps/chosen": -0.975854754447937, "logps/rejected": -37.28769302368164, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 1.864650011062622, "rewards/margins": 1.864650011062622, "rewards/rejected": 0.0, "step": 1786 }, { "epoch": 9.983240223463687, "grad_norm": 0.7704995271772535, "learning_rate": 3.7250500636203563e-07, "logits/chosen": -3.4895503520965576, "logits/rejected": -3.2797303199768066, "logps/chosen": -13.055721282958984, "logps/rejected": -37.99407196044922, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 4.027369499206543, "rewards/margins": 4.027369499206543, "rewards/rejected": 0.0, "step": 1787 }, { "epoch": 9.988826815642458, "grad_norm": 1.0171152776714127, "learning_rate": 3.719157059815868e-07, "logits/chosen": -3.2032382488250732, "logits/rejected": -3.1955862045288086, "logps/chosen": -3.1329994201660156, "logps/rejected": -41.41448211669922, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 2.500286817550659, "rewards/margins": 2.500286817550659, "rewards/rejected": 0.0, "step": 1788 }, { "epoch": 9.994413407821229, "grad_norm": 1.1703063868926848, "learning_rate": 3.713265959572093e-07, "logits/chosen": -3.1190359592437744, "logits/rejected": -3.178412437438965, "logps/chosen": -3.4634060859680176, "logps/rejected": -37.48868179321289, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 2.124701499938965, "rewards/margins": 2.124701499938965, "rewards/rejected": 0.0, "step": 1789 }, { "epoch": 10.0, "grad_norm": 2.2025068365730167, "learning_rate": 3.707376771644256e-07, "logits/chosen": -3.533526659011841, "logits/rejected": -3.4222147464752197, "logps/chosen": -2.5332818031311035, "logps/rejected": -44.49677276611328, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 3.1045467853546143, "rewards/margins": 3.1045467853546143, "rewards/rejected": 0.0, "step": 1790 }, { "epoch": 10.005586592178771, "grad_norm": 0.6971282634946807, "learning_rate": 3.7014895047847427e-07, "logits/chosen": -3.6349709033966064, "logits/rejected": -3.6591176986694336, "logps/chosen": -20.891502380371094, "logps/rejected": -64.14535522460938, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 3.370220184326172, "rewards/margins": 3.370220184326172, "rewards/rejected": 0.0, "step": 1791 }, { "epoch": 10.011173184357542, "grad_norm": 2.7256861857969237, "learning_rate": 3.695604167743076e-07, "logits/chosen": -3.316932439804077, "logits/rejected": -3.2103734016418457, "logps/chosen": -5.798719882965088, "logps/rejected": -46.49449157714844, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 2.738903522491455, "rewards/margins": 2.738903522491455, "rewards/rejected": 0.0, "step": 1792 }, { "epoch": 10.016759776536313, "grad_norm": 0.4488725480341928, "learning_rate": 3.689720769265916e-07, "logits/chosen": -3.404007911682129, "logits/rejected": -3.4754297733306885, "logps/chosen": -3.416945695877075, "logps/rejected": -38.77027893066406, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 2.912581443786621, "rewards/margins": 2.912581443786621, "rewards/rejected": 0.0, "step": 1793 }, { "epoch": 10.022346368715084, "grad_norm": 1.348873498036711, "learning_rate": 3.6838393180970414e-07, "logits/chosen": -3.72420072555542, "logits/rejected": -3.533327341079712, "logps/chosen": -2.403259754180908, "logps/rejected": -54.006866455078125, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 2.9669742584228516, "rewards/margins": 2.9669742584228516, "rewards/rejected": 0.0, "step": 1794 }, { "epoch": 10.027932960893855, "grad_norm": 0.6901470402549703, "learning_rate": 3.677959822977334e-07, "logits/chosen": -3.4903969764709473, "logits/rejected": -3.695781707763672, "logps/chosen": -16.70089340209961, "logps/rejected": -43.73395538330078, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 3.4970295429229736, "rewards/margins": 3.4970295429229736, "rewards/rejected": 0.0, "step": 1795 }, { "epoch": 10.033519553072626, "grad_norm": 0.874014065062377, "learning_rate": 3.6720822926447746e-07, "logits/chosen": -3.149998664855957, "logits/rejected": -3.092202663421631, "logps/chosen": -5.896096229553223, "logps/rejected": -41.47645568847656, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 2.8974430561065674, "rewards/margins": 2.8974430561065674, "rewards/rejected": 0.0, "step": 1796 }, { "epoch": 10.039106145251397, "grad_norm": 0.6459482810611393, "learning_rate": 3.666206735834415e-07, "logits/chosen": -3.2483434677124023, "logits/rejected": -3.4119620323181152, "logps/chosen": -0.2977844476699829, "logps/rejected": -72.59209442138672, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 1.855495810508728, "rewards/margins": 1.855495810508728, "rewards/rejected": 0.0, "step": 1797 }, { "epoch": 10.044692737430168, "grad_norm": 4.657927756822812, "learning_rate": 3.6603331612783814e-07, "logits/chosen": -3.57955002784729, "logits/rejected": -3.7270655632019043, "logps/chosen": -5.491798400878906, "logps/rejected": -32.26445770263672, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 3.0879874229431152, "rewards/margins": 3.0879874229431152, "rewards/rejected": 0.0, "step": 1798 }, { "epoch": 10.050279329608939, "grad_norm": 0.5170493262407111, "learning_rate": 3.654461577705855e-07, "logits/chosen": -3.653297185897827, "logits/rejected": -3.5519185066223145, "logps/chosen": -0.7011286616325378, "logps/rejected": -48.50175857543945, "loss": 0.0914, "rewards/accuracies": 1.0, "rewards/chosen": 2.2897849082946777, "rewards/margins": 2.2897849082946777, "rewards/rejected": 0.0, "step": 1799 }, { "epoch": 10.05586592178771, "grad_norm": 0.5477330112233832, "learning_rate": 3.6485919938430484e-07, "logits/chosen": -3.3516428470611572, "logits/rejected": -3.263317823410034, "logps/chosen": -0.7090689539909363, "logps/rejected": -63.48958206176758, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 2.2917845249176025, "rewards/margins": 2.2917845249176025, "rewards/rejected": 0.0, "step": 1800 }, { "epoch": 10.05586592178771, "eval_logits/chosen": -3.306220293045044, "eval_logits/rejected": -3.4329609870910645, "eval_logps/chosen": -27.864831924438477, "eval_logps/rejected": -50.33716583251953, "eval_loss": 0.8530665040016174, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.22052082419395447, "eval_rewards/margins": 0.22052082419395447, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7075, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 1800 }, { "epoch": 10.061452513966481, "grad_norm": 0.597354188490836, "learning_rate": 3.642724418413211e-07, "logits/chosen": -3.14695405960083, "logits/rejected": -3.36464786529541, "logps/chosen": -7.296316623687744, "logps/rejected": -83.26950073242188, "loss": 0.0847, "rewards/accuracies": 1.0, "rewards/chosen": 3.0922064781188965, "rewards/margins": 3.0922064781188965, "rewards/rejected": 0.0, "step": 1801 }, { "epoch": 10.067039106145252, "grad_norm": 3.7260784031422487, "learning_rate": 3.6368588601366084e-07, "logits/chosen": -3.3331053256988525, "logits/rejected": -3.6264700889587402, "logps/chosen": -0.9284124970436096, "logps/rejected": -79.09478759765625, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 2.1486902236938477, "rewards/margins": 2.1486902236938477, "rewards/rejected": 0.0, "step": 1802 }, { "epoch": 10.072625698324023, "grad_norm": 0.5138674110258655, "learning_rate": 3.6309953277305003e-07, "logits/chosen": -3.390658140182495, "logits/rejected": -3.3350045680999756, "logps/chosen": -0.8583160638809204, "logps/rejected": -34.64715576171875, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 2.389380931854248, "rewards/margins": 2.389380931854248, "rewards/rejected": 0.0, "step": 1803 }, { "epoch": 10.078212290502794, "grad_norm": 1.5476097008815286, "learning_rate": 3.625133829909141e-07, "logits/chosen": -3.593475580215454, "logits/rejected": -3.447777271270752, "logps/chosen": -3.3375725746154785, "logps/rejected": -33.17520523071289, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 2.4503748416900635, "rewards/margins": 2.4503748416900635, "rewards/rejected": 0.0, "step": 1804 }, { "epoch": 10.083798882681565, "grad_norm": 0.5830468945337419, "learning_rate": 3.619274375383763e-07, "logits/chosen": -3.523662567138672, "logits/rejected": -3.5923807621002197, "logps/chosen": -19.54915428161621, "logps/rejected": -27.53266143798828, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 3.7670845985412598, "rewards/margins": 3.7670845985412598, "rewards/rejected": 0.0, "step": 1805 }, { "epoch": 10.089385474860336, "grad_norm": 0.9087389475341742, "learning_rate": 3.613416972862554e-07, "logits/chosen": -3.2287206649780273, "logits/rejected": -3.3611888885498047, "logps/chosen": -3.0734684467315674, "logps/rejected": -77.2863998413086, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 2.500432014465332, "rewards/margins": 2.500432014465332, "rewards/rejected": 0.0, "step": 1806 }, { "epoch": 10.094972067039107, "grad_norm": 0.584162716982449, "learning_rate": 3.607561631050661e-07, "logits/chosen": -3.4512503147125244, "logits/rejected": -3.354774236679077, "logps/chosen": -6.594293594360352, "logps/rejected": -22.98871612548828, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 3.304326057434082, "rewards/margins": 3.304326057434082, "rewards/rejected": 0.0, "step": 1807 }, { "epoch": 10.100558659217878, "grad_norm": 1.9744675269338456, "learning_rate": 3.601708358650165e-07, "logits/chosen": -3.525938034057617, "logits/rejected": -3.686091423034668, "logps/chosen": -11.050585746765137, "logps/rejected": -44.24974822998047, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 2.7290430068969727, "rewards/margins": 2.7290430068969727, "rewards/rejected": 0.0, "step": 1808 }, { "epoch": 10.106145251396647, "grad_norm": 1.1717321376977452, "learning_rate": 3.5958571643600696e-07, "logits/chosen": -3.4633126258850098, "logits/rejected": -3.702258586883545, "logps/chosen": -2.6135754585266113, "logps/rejected": -45.58203887939453, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.6394338607788086, "rewards/margins": 2.6394338607788086, "rewards/rejected": 0.0, "step": 1809 }, { "epoch": 10.111731843575418, "grad_norm": 0.8484919348861357, "learning_rate": 3.5900080568762916e-07, "logits/chosen": -3.5047709941864014, "logits/rejected": -3.480854034423828, "logps/chosen": -47.91346740722656, "logps/rejected": -30.867521286010742, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": 4.134696006774902, "rewards/margins": 4.134696006774902, "rewards/rejected": 0.0, "step": 1810 }, { "epoch": 10.11731843575419, "grad_norm": 3.902059987595593, "learning_rate": 3.584161044891648e-07, "logits/chosen": -3.7783257961273193, "logits/rejected": -3.6784465312957764, "logps/chosen": -3.2678468227386475, "logps/rejected": -38.271728515625, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 3.1960103511810303, "rewards/margins": 3.1960103511810303, "rewards/rejected": 0.0, "step": 1811 }, { "epoch": 10.12290502793296, "grad_norm": 3.712507208825318, "learning_rate": 3.5783161370958374e-07, "logits/chosen": -3.2993576526641846, "logits/rejected": -3.4634902477264404, "logps/chosen": -0.25519460439682007, "logps/rejected": -98.33456420898438, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": 1.7136197090148926, "rewards/margins": 1.7136197090148926, "rewards/rejected": 0.0, "step": 1812 }, { "epoch": 10.128491620111731, "grad_norm": 1.321000307062693, "learning_rate": 3.572473342175436e-07, "logits/chosen": -3.3377838134765625, "logits/rejected": -3.445711374282837, "logps/chosen": -2.820636749267578, "logps/rejected": -41.340660095214844, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 2.604174852371216, "rewards/margins": 2.604174852371216, "rewards/rejected": 0.0, "step": 1813 }, { "epoch": 10.134078212290502, "grad_norm": 1.5393829717655072, "learning_rate": 3.566632668813878e-07, "logits/chosen": -3.440643787384033, "logits/rejected": -3.596426010131836, "logps/chosen": -18.2464542388916, "logps/rejected": -34.368446350097656, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 3.2566850185394287, "rewards/margins": 3.2566850185394287, "rewards/rejected": 0.0, "step": 1814 }, { "epoch": 10.139664804469273, "grad_norm": 1.9601218109100802, "learning_rate": 3.560794125691441e-07, "logits/chosen": -3.508427858352661, "logits/rejected": -3.4402294158935547, "logps/chosen": -1.8796428442001343, "logps/rejected": -39.425445556640625, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 3.0107765197753906, "rewards/margins": 3.0107765197753906, "rewards/rejected": 0.0, "step": 1815 }, { "epoch": 10.145251396648044, "grad_norm": 1.26301681295152, "learning_rate": 3.5549577214852415e-07, "logits/chosen": -3.3128695487976074, "logits/rejected": -3.274829864501953, "logps/chosen": -18.262205123901367, "logps/rejected": -34.38519287109375, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 3.6635007858276367, "rewards/margins": 3.6635007858276367, "rewards/rejected": 0.0, "step": 1816 }, { "epoch": 10.150837988826815, "grad_norm": 0.5983902495600948, "learning_rate": 3.549123464869218e-07, "logits/chosen": -3.3344552516937256, "logits/rejected": -3.501915693283081, "logps/chosen": -1.0359522104263306, "logps/rejected": -86.96023559570312, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.0813112258911133, "rewards/margins": 2.0813112258911133, "rewards/rejected": 0.0, "step": 1817 }, { "epoch": 10.156424581005586, "grad_norm": 0.5444704846871248, "learning_rate": 3.5432913645141103e-07, "logits/chosen": -3.424379825592041, "logits/rejected": -3.5716700553894043, "logps/chosen": -1.2069406509399414, "logps/rejected": -49.48115158081055, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 2.398582696914673, "rewards/margins": 2.398582696914673, "rewards/rejected": 0.0, "step": 1818 }, { "epoch": 10.162011173184357, "grad_norm": 0.7436498642751891, "learning_rate": 3.537461429087464e-07, "logits/chosen": -3.156156539916992, "logits/rejected": -2.994645833969116, "logps/chosen": -3.9752585887908936, "logps/rejected": -39.014522552490234, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 2.7616236209869385, "rewards/margins": 2.7616236209869385, "rewards/rejected": 0.0, "step": 1819 }, { "epoch": 10.167597765363128, "grad_norm": 0.6418105324988261, "learning_rate": 3.531633667253594e-07, "logits/chosen": -3.4729127883911133, "logits/rejected": -3.5277273654937744, "logps/chosen": -4.411098003387451, "logps/rejected": -68.09147644042969, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.95358943939209, "rewards/margins": 2.95358943939209, "rewards/rejected": 0.0, "step": 1820 }, { "epoch": 10.167597765363128, "eval_logits/chosen": -3.306750535964966, "eval_logits/rejected": -3.43415904045105, "eval_logps/chosen": -28.0472354888916, "eval_logps/rejected": -50.623146057128906, "eval_loss": 0.8597971200942993, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.20228078961372375, "eval_rewards/margins": 0.20228078961372375, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6905, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 1820 }, { "epoch": 10.1731843575419, "grad_norm": 0.5378953337576965, "learning_rate": 3.525808087673597e-07, "logits/chosen": -3.7045185565948486, "logits/rejected": -3.641814947128296, "logps/chosen": -1.8553470373153687, "logps/rejected": -30.911592483520508, "loss": 0.0801, "rewards/accuracies": 1.0, "rewards/chosen": 2.6366233825683594, "rewards/margins": 2.6366233825683594, "rewards/rejected": 0.0, "step": 1821 }, { "epoch": 10.17877094972067, "grad_norm": 1.4765511353150538, "learning_rate": 3.5199846990053227e-07, "logits/chosen": -3.639594078063965, "logits/rejected": -3.5415031909942627, "logps/chosen": -15.877860069274902, "logps/rejected": -27.99404525756836, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 3.320511817932129, "rewards/margins": 3.320511817932129, "rewards/rejected": 0.0, "step": 1822 }, { "epoch": 10.184357541899441, "grad_norm": 0.5777010850299931, "learning_rate": 3.5141635099033615e-07, "logits/chosen": -3.521170139312744, "logits/rejected": -3.6726415157318115, "logps/chosen": -1.5075368881225586, "logps/rejected": -33.77130889892578, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 2.0790767669677734, "rewards/margins": 2.0790767669677734, "rewards/rejected": 0.0, "step": 1823 }, { "epoch": 10.189944134078212, "grad_norm": 1.0164424489320223, "learning_rate": 3.508344529019036e-07, "logits/chosen": -3.419110059738159, "logits/rejected": -3.4553027153015137, "logps/chosen": -3.7882261276245117, "logps/rejected": -39.779170989990234, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 2.990870714187622, "rewards/margins": 2.990870714187622, "rewards/rejected": 0.0, "step": 1824 }, { "epoch": 10.195530726256983, "grad_norm": 0.4876300906458077, "learning_rate": 3.5025277650003924e-07, "logits/chosen": -3.7110509872436523, "logits/rejected": -3.6583542823791504, "logps/chosen": -0.771490216255188, "logps/rejected": -33.36054229736328, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 2.275239944458008, "rewards/margins": 2.275239944458008, "rewards/rejected": 0.0, "step": 1825 }, { "epoch": 10.201117318435754, "grad_norm": 0.4735647527961175, "learning_rate": 3.496713226492173e-07, "logits/chosen": -3.451616048812866, "logits/rejected": -3.6345643997192383, "logps/chosen": -4.551021099090576, "logps/rejected": -41.84909439086914, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 2.69582462310791, "rewards/margins": 2.69582462310791, "rewards/rejected": 0.0, "step": 1826 }, { "epoch": 10.206703910614525, "grad_norm": 0.5634175579177542, "learning_rate": 3.49090092213582e-07, "logits/chosen": -3.3760182857513428, "logits/rejected": -3.4502804279327393, "logps/chosen": -3.664790630340576, "logps/rejected": -55.21084976196289, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 2.8722262382507324, "rewards/margins": 2.8722262382507324, "rewards/rejected": 0.0, "step": 1827 }, { "epoch": 10.212290502793296, "grad_norm": 0.5762721791745985, "learning_rate": 3.485090860569454e-07, "logits/chosen": -3.5474774837493896, "logits/rejected": -3.703139543533325, "logps/chosen": -8.969099044799805, "logps/rejected": -69.3248291015625, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 3.520266532897949, "rewards/margins": 3.520266532897949, "rewards/rejected": 0.0, "step": 1828 }, { "epoch": 10.217877094972067, "grad_norm": 0.5241805173650772, "learning_rate": 3.4792830504278576e-07, "logits/chosen": -3.5396921634674072, "logits/rejected": -3.5115535259246826, "logps/chosen": -1.3261332511901855, "logps/rejected": -61.18022918701172, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": 2.5625224113464355, "rewards/margins": 2.5625224113464355, "rewards/rejected": 0.0, "step": 1829 }, { "epoch": 10.223463687150838, "grad_norm": 1.2202257954137754, "learning_rate": 3.4734775003424733e-07, "logits/chosen": -3.3850276470184326, "logits/rejected": -3.4800312519073486, "logps/chosen": -1.661970615386963, "logps/rejected": -66.8587646484375, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 2.148455858230591, "rewards/margins": 2.148455858230591, "rewards/rejected": 0.0, "step": 1830 }, { "epoch": 10.22905027932961, "grad_norm": 0.9781709482587243, "learning_rate": 3.4676742189413845e-07, "logits/chosen": -3.4181766510009766, "logits/rejected": -3.443974256515503, "logps/chosen": -0.8105766773223877, "logps/rejected": -30.953338623046875, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 2.427340507507324, "rewards/margins": 2.427340507507324, "rewards/rejected": 0.0, "step": 1831 }, { "epoch": 10.23463687150838, "grad_norm": 1.0589211914881005, "learning_rate": 3.4618732148492967e-07, "logits/chosen": -3.196134328842163, "logits/rejected": -3.3517751693725586, "logps/chosen": -7.323477268218994, "logps/rejected": -57.46253967285156, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 3.3929636478424072, "rewards/margins": 3.3929636478424072, "rewards/rejected": 0.0, "step": 1832 }, { "epoch": 10.240223463687151, "grad_norm": 0.5288517081599233, "learning_rate": 3.456074496687538e-07, "logits/chosen": -3.1686713695526123, "logits/rejected": -3.2956857681274414, "logps/chosen": -2.033550262451172, "logps/rejected": -42.6346435546875, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.9380688667297363, "rewards/margins": 2.9380688667297363, "rewards/rejected": 0.0, "step": 1833 }, { "epoch": 10.245810055865922, "grad_norm": 1.1668173711901333, "learning_rate": 3.4502780730740356e-07, "logits/chosen": -3.2127201557159424, "logits/rejected": -3.16595721244812, "logps/chosen": -6.029697895050049, "logps/rejected": -33.397308349609375, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 2.8888542652130127, "rewards/margins": 2.8888542652130127, "rewards/rejected": 0.0, "step": 1834 }, { "epoch": 10.251396648044693, "grad_norm": 1.2717630202495658, "learning_rate": 3.444483952623306e-07, "logits/chosen": -3.254587173461914, "logits/rejected": -3.267111301422119, "logps/chosen": -12.352456092834473, "logps/rejected": -44.57154846191406, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 3.622567892074585, "rewards/margins": 3.622567892074585, "rewards/rejected": 0.0, "step": 1835 }, { "epoch": 10.256983240223464, "grad_norm": 3.219122668949424, "learning_rate": 3.4386921439464443e-07, "logits/chosen": -3.407097578048706, "logits/rejected": -3.4369802474975586, "logps/chosen": -16.4822998046875, "logps/rejected": -44.601016998291016, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 3.7790446281433105, "rewards/margins": 3.7790446281433105, "rewards/rejected": 0.0, "step": 1836 }, { "epoch": 10.262569832402235, "grad_norm": 1.3450125333718765, "learning_rate": 3.432902655651113e-07, "logits/chosen": -3.3426268100738525, "logits/rejected": -3.3855786323547363, "logps/chosen": -5.525186538696289, "logps/rejected": -82.24298095703125, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 2.879131317138672, "rewards/margins": 2.879131317138672, "rewards/rejected": 0.0, "step": 1837 }, { "epoch": 10.268156424581006, "grad_norm": 0.6178454890474379, "learning_rate": 3.427115496341518e-07, "logits/chosen": -3.4179000854492188, "logits/rejected": -3.521860361099243, "logps/chosen": -8.484330177307129, "logps/rejected": -32.62311553955078, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": 3.538153648376465, "rewards/margins": 3.538153648376465, "rewards/rejected": 0.0, "step": 1838 }, { "epoch": 10.273743016759777, "grad_norm": 1.7629890611681795, "learning_rate": 3.421330674618411e-07, "logits/chosen": -3.66129994392395, "logits/rejected": -3.7412681579589844, "logps/chosen": -0.4885304570198059, "logps/rejected": -49.49581527709961, "loss": 0.1142, "rewards/accuracies": 1.0, "rewards/chosen": 1.7407662868499756, "rewards/margins": 1.7407662868499756, "rewards/rejected": 0.0, "step": 1839 }, { "epoch": 10.279329608938548, "grad_norm": 0.8929129133825531, "learning_rate": 3.4155481990790687e-07, "logits/chosen": -3.2866475582122803, "logits/rejected": -3.336662769317627, "logps/chosen": -1.3250962495803833, "logps/rejected": -55.27764892578125, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 2.3882293701171875, "rewards/margins": 2.3882293701171875, "rewards/rejected": 0.0, "step": 1840 }, { "epoch": 10.279329608938548, "eval_logits/chosen": -3.2945797443389893, "eval_logits/rejected": -3.4231491088867188, "eval_logps/chosen": -28.080852508544922, "eval_logps/rejected": -51.409461975097656, "eval_loss": 0.8763619661331177, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.19891873002052307, "eval_rewards/margins": 0.19891873002052307, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6942, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 1840 }, { "epoch": 10.28491620111732, "grad_norm": 0.708406499896884, "learning_rate": 3.4097680783172763e-07, "logits/chosen": -3.390831470489502, "logits/rejected": -3.603947877883911, "logps/chosen": -0.34076374769210815, "logps/rejected": -57.07746505737305, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 1.540673851966858, "rewards/margins": 1.540673851966858, "rewards/rejected": 0.0, "step": 1841 }, { "epoch": 10.29050279329609, "grad_norm": 1.2933475544436752, "learning_rate": 3.403990320923325e-07, "logits/chosen": -3.458280086517334, "logits/rejected": -3.5281569957733154, "logps/chosen": -1.0999212265014648, "logps/rejected": -42.88365936279297, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 2.469132423400879, "rewards/margins": 2.469132423400879, "rewards/rejected": 0.0, "step": 1842 }, { "epoch": 10.296089385474861, "grad_norm": 0.5619472867290808, "learning_rate": 3.3982149354839904e-07, "logits/chosen": -3.5612244606018066, "logits/rejected": -3.3477532863616943, "logps/chosen": -3.444309711456299, "logps/rejected": -82.17162322998047, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 3.4129276275634766, "rewards/margins": 3.4129276275634766, "rewards/rejected": 0.0, "step": 1843 }, { "epoch": 10.30167597765363, "grad_norm": 0.50006164653966, "learning_rate": 3.392441930582525e-07, "logits/chosen": -3.3518800735473633, "logits/rejected": -3.2428812980651855, "logps/chosen": -2.3187599182128906, "logps/rejected": -53.334068298339844, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 2.5861599445343018, "rewards/margins": 2.5861599445343018, "rewards/rejected": 0.0, "step": 1844 }, { "epoch": 10.307262569832401, "grad_norm": 1.164540651723829, "learning_rate": 3.3866713147986393e-07, "logits/chosen": -3.6116364002227783, "logits/rejected": -3.65683650970459, "logps/chosen": -2.0827600955963135, "logps/rejected": -35.13486862182617, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 2.676473379135132, "rewards/margins": 2.676473379135132, "rewards/rejected": 0.0, "step": 1845 }, { "epoch": 10.312849162011172, "grad_norm": 0.47667747136271305, "learning_rate": 3.380903096708501e-07, "logits/chosen": -3.578843116760254, "logits/rejected": -3.4206881523132324, "logps/chosen": -3.8796091079711914, "logps/rejected": -28.51175308227539, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": 3.4883346557617188, "rewards/margins": 3.4883346557617188, "rewards/rejected": 0.0, "step": 1846 }, { "epoch": 10.318435754189943, "grad_norm": 0.6376174597724137, "learning_rate": 3.375137284884703e-07, "logits/chosen": -3.4132537841796875, "logits/rejected": -3.46468186378479, "logps/chosen": -4.338701248168945, "logps/rejected": -51.54572677612305, "loss": 0.1098, "rewards/accuracies": 1.0, "rewards/chosen": 2.1232264041900635, "rewards/margins": 2.1232264041900635, "rewards/rejected": 0.0, "step": 1847 }, { "epoch": 10.324022346368714, "grad_norm": 0.8348576151942313, "learning_rate": 3.369373887896274e-07, "logits/chosen": -3.403993606567383, "logits/rejected": -3.4272985458374023, "logps/chosen": -1.2190645933151245, "logps/rejected": -32.052894592285156, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.384263277053833, "rewards/margins": 2.384263277053833, "rewards/rejected": 0.0, "step": 1848 }, { "epoch": 10.329608938547485, "grad_norm": 1.064786522288118, "learning_rate": 3.363612914308645e-07, "logits/chosen": -3.5142853260040283, "logits/rejected": -3.445352554321289, "logps/chosen": -0.7066118121147156, "logps/rejected": -51.051021575927734, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 2.0623998641967773, "rewards/margins": 2.0623998641967773, "rewards/rejected": 0.0, "step": 1849 }, { "epoch": 10.335195530726256, "grad_norm": 0.5238202917225792, "learning_rate": 3.3578543726836495e-07, "logits/chosen": -2.6879019737243652, "logits/rejected": -2.939012050628662, "logps/chosen": -4.14566707611084, "logps/rejected": -97.30796813964844, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": 2.2669615745544434, "rewards/margins": 2.2669615745544434, "rewards/rejected": 0.0, "step": 1850 }, { "epoch": 10.340782122905027, "grad_norm": 0.683918591125821, "learning_rate": 3.352098271579509e-07, "logits/chosen": -3.4271247386932373, "logits/rejected": -3.4677963256835938, "logps/chosen": -1.8000907897949219, "logps/rejected": -54.379390716552734, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 2.590089797973633, "rewards/margins": 2.590089797973633, "rewards/rejected": 0.0, "step": 1851 }, { "epoch": 10.346368715083798, "grad_norm": 1.2349938001511842, "learning_rate": 3.3463446195508096e-07, "logits/chosen": -3.3436098098754883, "logits/rejected": -3.552900552749634, "logps/chosen": -2.0247182846069336, "logps/rejected": -39.91437911987305, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": 2.3063344955444336, "rewards/margins": 2.3063344955444336, "rewards/rejected": 0.0, "step": 1852 }, { "epoch": 10.35195530726257, "grad_norm": 0.9950521200296845, "learning_rate": 3.340593425148506e-07, "logits/chosen": -3.435642719268799, "logits/rejected": -3.6248693466186523, "logps/chosen": -3.458634376525879, "logps/rejected": -62.87099838256836, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 2.806347370147705, "rewards/margins": 2.806347370147705, "rewards/rejected": 0.0, "step": 1853 }, { "epoch": 10.35754189944134, "grad_norm": 0.47763450859602763, "learning_rate": 3.3348446969198987e-07, "logits/chosen": -3.394721746444702, "logits/rejected": -3.4811933040618896, "logps/chosen": -3.7385308742523193, "logps/rejected": -42.472137451171875, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 2.706210136413574, "rewards/margins": 2.706210136413574, "rewards/rejected": 0.0, "step": 1854 }, { "epoch": 10.363128491620111, "grad_norm": 1.3451192899406743, "learning_rate": 3.3290984434086167e-07, "logits/chosen": -2.9988791942596436, "logits/rejected": -3.119492530822754, "logps/chosen": -9.244361877441406, "logps/rejected": -43.439910888671875, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 3.273994207382202, "rewards/margins": 3.273994207382202, "rewards/rejected": 0.0, "step": 1855 }, { "epoch": 10.368715083798882, "grad_norm": 2.619526409237513, "learning_rate": 3.32335467315462e-07, "logits/chosen": -3.699422836303711, "logits/rejected": -3.593337059020996, "logps/chosen": -7.9409332275390625, "logps/rejected": -41.3218994140625, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 4.015827178955078, "rewards/margins": 4.015827178955078, "rewards/rejected": 0.0, "step": 1856 }, { "epoch": 10.374301675977653, "grad_norm": 0.5557399530140986, "learning_rate": 3.317613394694175e-07, "logits/chosen": -3.137514114379883, "logits/rejected": -3.0139694213867188, "logps/chosen": -0.49404364824295044, "logps/rejected": -95.09521484375, "loss": 0.1072, "rewards/accuracies": 1.0, "rewards/chosen": 1.7863245010375977, "rewards/margins": 1.7863245010375977, "rewards/rejected": 0.0, "step": 1857 }, { "epoch": 10.379888268156424, "grad_norm": 0.6493478016639063, "learning_rate": 3.3118746165598397e-07, "logits/chosen": -3.3223178386688232, "logits/rejected": -3.1887118816375732, "logps/chosen": -2.961448907852173, "logps/rejected": -81.54209899902344, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 2.4504408836364746, "rewards/margins": 2.4504408836364746, "rewards/rejected": 0.0, "step": 1858 }, { "epoch": 10.385474860335195, "grad_norm": 1.9270303079384823, "learning_rate": 3.306138347280463e-07, "logits/chosen": -3.484477996826172, "logits/rejected": -3.3443188667297363, "logps/chosen": -2.8306005001068115, "logps/rejected": -41.490234375, "loss": 0.1235, "rewards/accuracies": 1.0, "rewards/chosen": 2.867560386657715, "rewards/margins": 2.867560386657715, "rewards/rejected": 0.0, "step": 1859 }, { "epoch": 10.391061452513966, "grad_norm": 0.8294739883535532, "learning_rate": 3.3004045953811646e-07, "logits/chosen": -3.254361629486084, "logits/rejected": -3.349991798400879, "logps/chosen": -1.9156274795532227, "logps/rejected": -110.4948501586914, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": 2.4569637775421143, "rewards/margins": 2.4569637775421143, "rewards/rejected": 0.0, "step": 1860 }, { "epoch": 10.391061452513966, "eval_logits/chosen": -3.29571533203125, "eval_logits/rejected": -3.4226455688476562, "eval_logps/chosen": -28.695659637451172, "eval_logps/rejected": -52.63666534423828, "eval_loss": 0.9198136925697327, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.13743813335895538, "eval_rewards/margins": 0.13743813335895538, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6799, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.306, "step": 1860 }, { "epoch": 10.396648044692737, "grad_norm": 0.6064433511872755, "learning_rate": 3.2946733693833174e-07, "logits/chosen": -3.3379640579223633, "logits/rejected": -3.3924098014831543, "logps/chosen": -2.839841365814209, "logps/rejected": -57.46247100830078, "loss": 0.1402, "rewards/accuracies": 1.0, "rewards/chosen": 2.3916800022125244, "rewards/margins": 2.3916800022125244, "rewards/rejected": 0.0, "step": 1861 }, { "epoch": 10.402234636871508, "grad_norm": 2.1087870060840728, "learning_rate": 3.288944677804546e-07, "logits/chosen": -3.341815233230591, "logits/rejected": -3.523813009262085, "logps/chosen": -5.720947265625, "logps/rejected": -101.80484008789062, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 3.270772933959961, "rewards/margins": 3.270772933959961, "rewards/rejected": 0.0, "step": 1862 }, { "epoch": 10.40782122905028, "grad_norm": 0.4689909392959233, "learning_rate": 3.2832185291587053e-07, "logits/chosen": -3.3669724464416504, "logits/rejected": -3.6839537620544434, "logps/chosen": -0.6228213310241699, "logps/rejected": -40.415008544921875, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 2.0144503116607666, "rewards/margins": 2.0144503116607666, "rewards/rejected": 0.0, "step": 1863 }, { "epoch": 10.41340782122905, "grad_norm": 4.798617740793981, "learning_rate": 3.2774949319558723e-07, "logits/chosen": -3.4009153842926025, "logits/rejected": -3.3676819801330566, "logps/chosen": -6.605515956878662, "logps/rejected": -32.565277099609375, "loss": 0.1368, "rewards/accuracies": 1.0, "rewards/chosen": 3.265873432159424, "rewards/margins": 3.265873432159424, "rewards/rejected": 0.0, "step": 1864 }, { "epoch": 10.418994413407821, "grad_norm": 0.6096916993770324, "learning_rate": 3.271773894702331e-07, "logits/chosen": -3.2294697761535645, "logits/rejected": -3.4293994903564453, "logps/chosen": -0.6215426921844482, "logps/rejected": -75.20904541015625, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 2.0924994945526123, "rewards/margins": 2.0924994945526123, "rewards/rejected": 0.0, "step": 1865 }, { "epoch": 10.424581005586592, "grad_norm": 0.6886767599347046, "learning_rate": 3.2660554259005645e-07, "logits/chosen": -3.4089062213897705, "logits/rejected": -3.3943350315093994, "logps/chosen": -6.764495372772217, "logps/rejected": -39.120704650878906, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 3.6976699829101562, "rewards/margins": 3.6976699829101562, "rewards/rejected": 0.0, "step": 1866 }, { "epoch": 10.430167597765363, "grad_norm": 0.5453366272332151, "learning_rate": 3.260339534049232e-07, "logits/chosen": -3.6877708435058594, "logits/rejected": -3.6817972660064697, "logps/chosen": -13.224329948425293, "logps/rejected": -42.58197021484375, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 3.314650535583496, "rewards/margins": 3.314650535583496, "rewards/rejected": 0.0, "step": 1867 }, { "epoch": 10.435754189944134, "grad_norm": 0.7275898467664447, "learning_rate": 3.254626227643168e-07, "logits/chosen": -3.495012044906616, "logits/rejected": -3.6272099018096924, "logps/chosen": -8.475492477416992, "logps/rejected": -24.994121551513672, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 3.5595078468322754, "rewards/margins": 3.5595078468322754, "rewards/rejected": 0.0, "step": 1868 }, { "epoch": 10.441340782122905, "grad_norm": 2.484363144221312, "learning_rate": 3.248915515173366e-07, "logits/chosen": -3.5623674392700195, "logits/rejected": -3.687246799468994, "logps/chosen": -7.592851638793945, "logps/rejected": -52.94874954223633, "loss": 0.1213, "rewards/accuracies": 1.0, "rewards/chosen": 4.052880764007568, "rewards/margins": 4.052880764007568, "rewards/rejected": 0.0, "step": 1869 }, { "epoch": 10.446927374301676, "grad_norm": 0.603735940648404, "learning_rate": 3.243207405126958e-07, "logits/chosen": -3.6547141075134277, "logits/rejected": -3.6865251064300537, "logps/chosen": -6.05224609375, "logps/rejected": -21.353927612304688, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": 2.1725809574127197, "rewards/margins": 2.1725809574127197, "rewards/rejected": 0.0, "step": 1870 }, { "epoch": 10.452513966480447, "grad_norm": 1.1344001752076176, "learning_rate": 3.237501905987214e-07, "logits/chosen": -2.693904161453247, "logits/rejected": -2.6437413692474365, "logps/chosen": -24.93901824951172, "logps/rejected": -94.80763244628906, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 3.0128211975097656, "rewards/margins": 3.0128211975097656, "rewards/rejected": 0.0, "step": 1871 }, { "epoch": 10.458100558659218, "grad_norm": 0.7219791491200507, "learning_rate": 3.2317990262335217e-07, "logits/chosen": -3.526754856109619, "logits/rejected": -3.445040464401245, "logps/chosen": -8.690518379211426, "logps/rejected": -58.01318359375, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 3.334439754486084, "rewards/margins": 3.334439754486084, "rewards/rejected": 0.0, "step": 1872 }, { "epoch": 10.46368715083799, "grad_norm": 0.6052807808897211, "learning_rate": 3.2260987743413756e-07, "logits/chosen": -3.3755505084991455, "logits/rejected": -3.410623073577881, "logps/chosen": -1.857917308807373, "logps/rejected": -54.81926345825195, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.1309776306152344, "rewards/margins": 2.1309776306152344, "rewards/rejected": 0.0, "step": 1873 }, { "epoch": 10.46927374301676, "grad_norm": 0.597599134085182, "learning_rate": 3.220401158782364e-07, "logits/chosen": -3.3665552139282227, "logits/rejected": -3.3034322261810303, "logps/chosen": -8.424214363098145, "logps/rejected": -34.91681671142578, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 2.7273669242858887, "rewards/margins": 2.7273669242858887, "rewards/rejected": 0.0, "step": 1874 }, { "epoch": 10.474860335195531, "grad_norm": 0.5489151263112975, "learning_rate": 3.214706188024162e-07, "logits/chosen": -3.3839004039764404, "logits/rejected": -3.606938123703003, "logps/chosen": -1.805442214012146, "logps/rejected": -63.92960739135742, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 2.525209903717041, "rewards/margins": 2.525209903717041, "rewards/rejected": 0.0, "step": 1875 }, { "epoch": 10.480446927374302, "grad_norm": 0.703639741134572, "learning_rate": 3.2090138705305065e-07, "logits/chosen": -3.439547300338745, "logits/rejected": -3.591928005218506, "logps/chosen": -2.223017454147339, "logps/rejected": -49.858394622802734, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 2.5340662002563477, "rewards/margins": 2.5340662002563477, "rewards/rejected": 0.0, "step": 1876 }, { "epoch": 10.486033519553073, "grad_norm": 1.854366482925165, "learning_rate": 3.203324214761198e-07, "logits/chosen": -3.5880889892578125, "logits/rejected": -3.425997257232666, "logps/chosen": -3.504964590072632, "logps/rejected": -44.9100341796875, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 2.7333126068115234, "rewards/margins": 2.7333126068115234, "rewards/rejected": 0.0, "step": 1877 }, { "epoch": 10.491620111731844, "grad_norm": 0.6785987682925823, "learning_rate": 3.1976372291720756e-07, "logits/chosen": -3.468360662460327, "logits/rejected": -3.4381210803985596, "logps/chosen": -3.877368211746216, "logps/rejected": -41.90599060058594, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 3.057978630065918, "rewards/margins": 3.057978630065918, "rewards/rejected": 0.0, "step": 1878 }, { "epoch": 10.497206703910614, "grad_norm": 2.0267640363629873, "learning_rate": 3.191952922215013e-07, "logits/chosen": -3.4153389930725098, "logits/rejected": -3.2908735275268555, "logps/chosen": -17.036602020263672, "logps/rejected": -59.45793151855469, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 3.0718488693237305, "rewards/margins": 3.0718488693237305, "rewards/rejected": 0.0, "step": 1879 }, { "epoch": 10.502793296089386, "grad_norm": 1.2286848174016436, "learning_rate": 3.186271302337906e-07, "logits/chosen": -3.3930225372314453, "logits/rejected": -3.4970273971557617, "logps/chosen": -6.700510501861572, "logps/rejected": -53.76878356933594, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 2.4969258308410645, "rewards/margins": 2.4969258308410645, "rewards/rejected": 0.0, "step": 1880 }, { "epoch": 10.502793296089386, "eval_logits/chosen": -3.3038909435272217, "eval_logits/rejected": -3.43035626411438, "eval_logps/chosen": -28.814382553100586, "eval_logps/rejected": -53.03582000732422, "eval_loss": 0.9273196458816528, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.12556585669517517, "eval_rewards/margins": 0.12556585669517517, "eval_rewards/rejected": 0.0, "eval_runtime": 32.722, "eval_samples_per_second": 9.474, "eval_steps_per_second": 0.306, "step": 1880 }, { "epoch": 10.508379888268156, "grad_norm": 1.3767565364552767, "learning_rate": 3.180592377984649e-07, "logits/chosen": -3.4134857654571533, "logits/rejected": -3.629716396331787, "logps/chosen": -3.9373466968536377, "logps/rejected": -50.44145202636719, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": 2.643033742904663, "rewards/margins": 2.643033742904663, "rewards/rejected": 0.0, "step": 1881 }, { "epoch": 10.513966480446927, "grad_norm": 0.5177749659849222, "learning_rate": 3.174916157595138e-07, "logits/chosen": -3.090735912322998, "logits/rejected": -3.250361919403076, "logps/chosen": -12.252424240112305, "logps/rejected": -33.36803436279297, "loss": 0.1161, "rewards/accuracies": 1.0, "rewards/chosen": 3.7724671363830566, "rewards/margins": 3.7724671363830566, "rewards/rejected": 0.0, "step": 1882 }, { "epoch": 10.519553072625698, "grad_norm": 0.8496090813239161, "learning_rate": 3.169242649605246e-07, "logits/chosen": -3.366504669189453, "logits/rejected": -3.426129102706909, "logps/chosen": -1.0884718894958496, "logps/rejected": -38.06122589111328, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 2.3657970428466797, "rewards/margins": 2.3657970428466797, "rewards/rejected": 0.0, "step": 1883 }, { "epoch": 10.525139664804469, "grad_norm": 0.8796284574737169, "learning_rate": 3.163571862446818e-07, "logits/chosen": -3.7220442295074463, "logits/rejected": -3.6978707313537598, "logps/chosen": -7.9090800285339355, "logps/rejected": -24.3577938079834, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.8309731483459473, "rewards/margins": 2.8309731483459473, "rewards/rejected": 0.0, "step": 1884 }, { "epoch": 10.53072625698324, "grad_norm": 0.5312805536348916, "learning_rate": 3.15790380454765e-07, "logits/chosen": -3.2869033813476562, "logits/rejected": -3.2248241901397705, "logps/chosen": -2.2369425296783447, "logps/rejected": -33.74311828613281, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": 3.060045003890991, "rewards/margins": 3.060045003890991, "rewards/rejected": 0.0, "step": 1885 }, { "epoch": 10.53631284916201, "grad_norm": 0.7302794320445865, "learning_rate": 3.152238484331491e-07, "logits/chosen": -3.4044575691223145, "logits/rejected": -3.6623547077178955, "logps/chosen": -6.0326924324035645, "logps/rejected": -89.37249755859375, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 2.8519020080566406, "rewards/margins": 2.8519020080566406, "rewards/rejected": 0.0, "step": 1886 }, { "epoch": 10.541899441340782, "grad_norm": 0.5663235521646847, "learning_rate": 3.14657591021801e-07, "logits/chosen": -3.168351173400879, "logits/rejected": -3.2040045261383057, "logps/chosen": -2.657283067703247, "logps/rejected": -60.280731201171875, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 2.359914779663086, "rewards/margins": 2.359914779663086, "rewards/rejected": 0.0, "step": 1887 }, { "epoch": 10.547486033519553, "grad_norm": 0.9185305140268429, "learning_rate": 3.140916090622803e-07, "logits/chosen": -3.514221668243408, "logits/rejected": -3.578711748123169, "logps/chosen": -0.9986670017242432, "logps/rejected": -43.1521110534668, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.387401580810547, "rewards/margins": 2.387401580810547, "rewards/rejected": 0.0, "step": 1888 }, { "epoch": 10.553072625698324, "grad_norm": 1.4701073629648287, "learning_rate": 3.135259033957373e-07, "logits/chosen": -3.2775659561157227, "logits/rejected": -3.515453338623047, "logps/chosen": -8.058919906616211, "logps/rejected": -49.19865417480469, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 3.6242642402648926, "rewards/margins": 3.6242642402648926, "rewards/rejected": 0.0, "step": 1889 }, { "epoch": 10.558659217877095, "grad_norm": 2.8172893416366644, "learning_rate": 3.129604748629108e-07, "logits/chosen": -3.2192795276641846, "logits/rejected": -3.2048914432525635, "logps/chosen": -0.8653272390365601, "logps/rejected": -58.21757507324219, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 2.102515697479248, "rewards/margins": 2.102515697479248, "rewards/rejected": 0.0, "step": 1890 }, { "epoch": 10.564245810055866, "grad_norm": 0.5448801466788024, "learning_rate": 3.123953243041286e-07, "logits/chosen": -3.1389858722686768, "logits/rejected": -3.3476922512054443, "logps/chosen": -1.8807554244995117, "logps/rejected": -46.98809814453125, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": 2.8406925201416016, "rewards/margins": 2.8406925201416016, "rewards/rejected": 0.0, "step": 1891 }, { "epoch": 10.569832402234637, "grad_norm": 0.6888829582532957, "learning_rate": 3.118304525593052e-07, "logits/chosen": -3.44079327583313, "logits/rejected": -3.4656615257263184, "logps/chosen": -0.6652265191078186, "logps/rejected": -87.39227294921875, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 1.701551079750061, "rewards/margins": 1.701551079750061, "rewards/rejected": 0.0, "step": 1892 }, { "epoch": 10.575418994413408, "grad_norm": 0.8904888977785465, "learning_rate": 3.1126586046794036e-07, "logits/chosen": -2.9938697814941406, "logits/rejected": -3.435041666030884, "logps/chosen": -0.6238469481468201, "logps/rejected": -48.46592330932617, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 1.8748118877410889, "rewards/margins": 1.8748118877410889, "rewards/rejected": 0.0, "step": 1893 }, { "epoch": 10.581005586592179, "grad_norm": 1.4878413639749024, "learning_rate": 3.107015488691185e-07, "logits/chosen": -2.7491397857666016, "logits/rejected": -2.68963623046875, "logps/chosen": -19.811344146728516, "logps/rejected": -62.037689208984375, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": 2.192837953567505, "rewards/margins": 2.192837953567505, "rewards/rejected": 0.0, "step": 1894 }, { "epoch": 10.58659217877095, "grad_norm": 0.6106390540040627, "learning_rate": 3.101375186015074e-07, "logits/chosen": -3.508349657058716, "logits/rejected": -3.42073655128479, "logps/chosen": -10.890157699584961, "logps/rejected": -28.693105697631836, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": 3.7642245292663574, "rewards/margins": 3.7642245292663574, "rewards/rejected": 0.0, "step": 1895 }, { "epoch": 10.59217877094972, "grad_norm": 0.8537535394462678, "learning_rate": 3.095737705033562e-07, "logits/chosen": -3.3676419258117676, "logits/rejected": -3.559569835662842, "logps/chosen": -1.8120158910751343, "logps/rejected": -109.5850830078125, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 2.8345835208892822, "rewards/margins": 2.8345835208892822, "rewards/rejected": 0.0, "step": 1896 }, { "epoch": 10.597765363128492, "grad_norm": 0.555774535175715, "learning_rate": 3.0901030541249507e-07, "logits/chosen": -3.679433822631836, "logits/rejected": -3.6491522789001465, "logps/chosen": -5.315507411956787, "logps/rejected": -32.05292510986328, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": 3.4725875854492188, "rewards/margins": 3.4725875854492188, "rewards/rejected": 0.0, "step": 1897 }, { "epoch": 10.603351955307263, "grad_norm": 1.2410684993215815, "learning_rate": 3.084471241663337e-07, "logits/chosen": -3.521740198135376, "logits/rejected": -3.4961280822753906, "logps/chosen": -6.980363845825195, "logps/rejected": -42.24245071411133, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 3.250321388244629, "rewards/margins": 3.250321388244629, "rewards/rejected": 0.0, "step": 1898 }, { "epoch": 10.608938547486034, "grad_norm": 0.5777248211280508, "learning_rate": 3.078842276018592e-07, "logits/chosen": -3.5176894664764404, "logits/rejected": -3.681659698486328, "logps/chosen": -1.005691409111023, "logps/rejected": -30.57900619506836, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": 2.057600975036621, "rewards/margins": 2.057600975036621, "rewards/rejected": 0.0, "step": 1899 }, { "epoch": 10.614525139664805, "grad_norm": 0.6841981164333766, "learning_rate": 3.0732161655563663e-07, "logits/chosen": -3.368708848953247, "logits/rejected": -3.388643503189087, "logps/chosen": -1.4272022247314453, "logps/rejected": -39.79710388183594, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.8871188163757324, "rewards/margins": 2.8871188163757324, "rewards/rejected": 0.0, "step": 1900 }, { "epoch": 10.614525139664805, "eval_logits/chosen": -3.3017399311065674, "eval_logits/rejected": -3.4273781776428223, "eval_logps/chosen": -28.869882583618164, "eval_logps/rejected": -52.3021240234375, "eval_loss": 0.930737316608429, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.12001585960388184, "eval_rewards/margins": 0.12001585960388184, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7227, "eval_samples_per_second": 9.474, "eval_steps_per_second": 0.306, "step": 1900 }, { "epoch": 10.620111731843576, "grad_norm": 0.873393200558506, "learning_rate": 3.06759291863806e-07, "logits/chosen": -3.3183014392852783, "logits/rejected": -3.3055436611175537, "logps/chosen": -1.8771289587020874, "logps/rejected": -41.003761291503906, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 2.37975811958313, "rewards/margins": 2.37975811958313, "rewards/rejected": 0.0, "step": 1901 }, { "epoch": 10.625698324022347, "grad_norm": 1.0898740440146328, "learning_rate": 3.06197254362082e-07, "logits/chosen": -3.221456289291382, "logits/rejected": -3.405565023422241, "logps/chosen": -6.760497093200684, "logps/rejected": -75.37287902832031, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 2.8092257976531982, "rewards/margins": 2.8092257976531982, "rewards/rejected": 0.0, "step": 1902 }, { "epoch": 10.631284916201118, "grad_norm": 0.7698520182240444, "learning_rate": 3.0563550488575217e-07, "logits/chosen": -3.7989161014556885, "logits/rejected": -3.5736124515533447, "logps/chosen": -0.87165367603302, "logps/rejected": -71.2402114868164, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 2.5508410930633545, "rewards/margins": 2.5508410930633545, "rewards/rejected": 0.0, "step": 1903 }, { "epoch": 10.636871508379889, "grad_norm": 1.2176089987866257, "learning_rate": 3.0507404426967684e-07, "logits/chosen": -3.381575107574463, "logits/rejected": -3.1284801959991455, "logps/chosen": -0.5544835329055786, "logps/rejected": -59.278507232666016, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 2.175469398498535, "rewards/margins": 2.175469398498535, "rewards/rejected": 0.0, "step": 1904 }, { "epoch": 10.64245810055866, "grad_norm": 0.7624922702151887, "learning_rate": 3.045128733482858e-07, "logits/chosen": -3.4666285514831543, "logits/rejected": -3.5378479957580566, "logps/chosen": -0.3660850524902344, "logps/rejected": -49.37598419189453, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 1.7370697259902954, "rewards/margins": 1.7370697259902954, "rewards/rejected": 0.0, "step": 1905 }, { "epoch": 10.64804469273743, "grad_norm": 1.1756598573532124, "learning_rate": 3.0395199295557963e-07, "logits/chosen": -3.420257806777954, "logits/rejected": -3.4432029724121094, "logps/chosen": -29.889463424682617, "logps/rejected": -50.085670471191406, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 3.109011650085449, "rewards/margins": 3.109011650085449, "rewards/rejected": 0.0, "step": 1906 }, { "epoch": 10.653631284916202, "grad_norm": 1.3235234717404352, "learning_rate": 3.0339140392512597e-07, "logits/chosen": -3.469914674758911, "logits/rejected": -3.234611988067627, "logps/chosen": -0.6308854818344116, "logps/rejected": -52.49540710449219, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 2.2448477745056152, "rewards/margins": 2.2448477745056152, "rewards/rejected": 0.0, "step": 1907 }, { "epoch": 10.659217877094973, "grad_norm": 1.232860094957608, "learning_rate": 3.028311070900601e-07, "logits/chosen": -3.393388509750366, "logits/rejected": -3.316394567489624, "logps/chosen": -0.7641682624816895, "logps/rejected": -44.09846115112305, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9598150253295898, "rewards/margins": 1.9598150253295898, "rewards/rejected": 0.0, "step": 1908 }, { "epoch": 10.664804469273744, "grad_norm": 0.7945585085668834, "learning_rate": 3.022711032830831e-07, "logits/chosen": -3.463810443878174, "logits/rejected": -3.5105905532836914, "logps/chosen": -13.247488975524902, "logps/rejected": -35.6832275390625, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.5260543823242188, "rewards/margins": 2.5260543823242188, "rewards/rejected": 0.0, "step": 1909 }, { "epoch": 10.670391061452515, "grad_norm": 1.8275428433326804, "learning_rate": 3.017113933364601e-07, "logits/chosen": -3.427177667617798, "logits/rejected": -3.3653926849365234, "logps/chosen": -1.6435902118682861, "logps/rejected": -40.15003204345703, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": 2.7192788124084473, "rewards/margins": 2.7192788124084473, "rewards/rejected": 0.0, "step": 1910 }, { "epoch": 10.675977653631286, "grad_norm": 1.301829605088348, "learning_rate": 3.0115197808202e-07, "logits/chosen": -3.458570957183838, "logits/rejected": -3.323073387145996, "logps/chosen": -3.0687127113342285, "logps/rejected": -42.68876266479492, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 3.2481980323791504, "rewards/margins": 3.2481980323791504, "rewards/rejected": 0.0, "step": 1911 }, { "epoch": 10.681564245810057, "grad_norm": 1.1111341206972045, "learning_rate": 3.005928583511533e-07, "logits/chosen": -3.2323250770568848, "logits/rejected": -3.360978126525879, "logps/chosen": -24.130712509155273, "logps/rejected": -93.03076934814453, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 3.4027507305145264, "rewards/margins": 3.4027507305145264, "rewards/rejected": 0.0, "step": 1912 }, { "epoch": 10.687150837988828, "grad_norm": 0.597553110429445, "learning_rate": 3.000340349748115e-07, "logits/chosen": -3.2457923889160156, "logits/rejected": -3.084268808364868, "logps/chosen": -4.707247734069824, "logps/rejected": -32.77264404296875, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 2.896733283996582, "rewards/margins": 2.896733283996582, "rewards/rejected": 0.0, "step": 1913 }, { "epoch": 10.692737430167599, "grad_norm": 1.8222346411955612, "learning_rate": 2.994755087835057e-07, "logits/chosen": -3.381150722503662, "logits/rejected": -3.5762670040130615, "logps/chosen": -1.53989839553833, "logps/rejected": -61.47698974609375, "loss": 0.1051, "rewards/accuracies": 1.0, "rewards/chosen": 2.131669044494629, "rewards/margins": 2.131669044494629, "rewards/rejected": 0.0, "step": 1914 }, { "epoch": 10.69832402234637, "grad_norm": 0.606089882489306, "learning_rate": 2.9891728060730554e-07, "logits/chosen": -3.354189157485962, "logits/rejected": -3.413299560546875, "logps/chosen": -4.776266574859619, "logps/rejected": -37.680641174316406, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": 3.6719112396240234, "rewards/margins": 3.6719112396240234, "rewards/rejected": 0.0, "step": 1915 }, { "epoch": 10.703910614525139, "grad_norm": 0.6255216250633863, "learning_rate": 2.98359351275837e-07, "logits/chosen": -3.3042688369750977, "logits/rejected": -3.2384936809539795, "logps/chosen": -21.000625610351562, "logps/rejected": -35.76881790161133, "loss": 0.0752, "rewards/accuracies": 1.0, "rewards/chosen": 3.684452533721924, "rewards/margins": 3.684452533721924, "rewards/rejected": 0.0, "step": 1916 }, { "epoch": 10.70949720670391, "grad_norm": 0.5183001117662647, "learning_rate": 2.978017216182828e-07, "logits/chosen": -3.7192203998565674, "logits/rejected": -3.6571333408355713, "logps/chosen": -0.5691882371902466, "logps/rejected": -41.70458984375, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 1.7690117359161377, "rewards/margins": 1.7690117359161377, "rewards/rejected": 0.0, "step": 1917 }, { "epoch": 10.71508379888268, "grad_norm": 0.5499181775208685, "learning_rate": 2.9724439246337983e-07, "logits/chosen": -3.563899040222168, "logits/rejected": -3.4503607749938965, "logps/chosen": -31.96733283996582, "logps/rejected": -61.7823600769043, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 4.206428050994873, "rewards/margins": 4.206428050994873, "rewards/rejected": 0.0, "step": 1918 }, { "epoch": 10.720670391061452, "grad_norm": 1.5115312931237417, "learning_rate": 2.966873646394184e-07, "logits/chosen": -3.1484599113464355, "logits/rejected": -3.1858108043670654, "logps/chosen": -3.586225748062134, "logps/rejected": -45.302642822265625, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 2.2733733654022217, "rewards/margins": 2.2733733654022217, "rewards/rejected": 0.0, "step": 1919 }, { "epoch": 10.726256983240223, "grad_norm": 0.62356296755553, "learning_rate": 2.961306389742412e-07, "logits/chosen": -3.3710219860076904, "logits/rejected": -3.560894250869751, "logps/chosen": -2.2886223793029785, "logps/rejected": -25.385658264160156, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 2.3927793502807617, "rewards/margins": 2.3927793502807617, "rewards/rejected": 0.0, "step": 1920 }, { "epoch": 10.726256983240223, "eval_logits/chosen": -3.2988133430480957, "eval_logits/rejected": -3.424295425415039, "eval_logps/chosen": -28.57001304626465, "eval_logps/rejected": -52.16538619995117, "eval_loss": 0.9159795045852661, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.15000297129154205, "eval_rewards/margins": 0.15000297129154205, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7401, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 1920 }, { "epoch": 10.731843575418994, "grad_norm": 1.0503261425352703, "learning_rate": 2.955742162952416e-07, "logits/chosen": -3.2798068523406982, "logits/rejected": -3.387129783630371, "logps/chosen": -0.26641911268234253, "logps/rejected": -74.62503814697266, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 1.4750702381134033, "rewards/margins": 1.4750702381134033, "rewards/rejected": 0.0, "step": 1921 }, { "epoch": 10.737430167597765, "grad_norm": 2.100259964729435, "learning_rate": 2.950180974293629e-07, "logits/chosen": -3.501065969467163, "logits/rejected": -3.4448275566101074, "logps/chosen": -1.0389482975006104, "logps/rejected": -31.143098831176758, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": 2.0664217472076416, "rewards/margins": 2.0664217472076416, "rewards/rejected": 0.0, "step": 1922 }, { "epoch": 10.743016759776536, "grad_norm": 2.1238573896044284, "learning_rate": 2.944622832030965e-07, "logits/chosen": -3.509662628173828, "logits/rejected": -3.5671849250793457, "logps/chosen": -1.2824070453643799, "logps/rejected": -116.43682098388672, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": 2.1577115058898926, "rewards/margins": 2.1577115058898926, "rewards/rejected": 0.0, "step": 1923 }, { "epoch": 10.748603351955307, "grad_norm": 0.839912529658205, "learning_rate": 2.939067744424818e-07, "logits/chosen": -3.3863625526428223, "logits/rejected": -3.307372570037842, "logps/chosen": -4.680670261383057, "logps/rejected": -69.56145477294922, "loss": 0.1003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3522374629974365, "rewards/margins": 2.3522374629974365, "rewards/rejected": 0.0, "step": 1924 }, { "epoch": 10.754189944134078, "grad_norm": 0.5323115968588062, "learning_rate": 2.933515719731032e-07, "logits/chosen": -3.5454235076904297, "logits/rejected": -3.3904130458831787, "logps/chosen": -1.270723819732666, "logps/rejected": -50.822425842285156, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 2.1869544982910156, "rewards/margins": 2.1869544982910156, "rewards/rejected": 0.0, "step": 1925 }, { "epoch": 10.759776536312849, "grad_norm": 0.5828229340637391, "learning_rate": 2.927966766200908e-07, "logits/chosen": -3.409114122390747, "logits/rejected": -3.580754280090332, "logps/chosen": -1.3685938119888306, "logps/rejected": -67.45565032958984, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 2.452620267868042, "rewards/margins": 2.452620267868042, "rewards/rejected": 0.0, "step": 1926 }, { "epoch": 10.76536312849162, "grad_norm": 0.5205560299987109, "learning_rate": 2.92242089208118e-07, "logits/chosen": -3.6759276390075684, "logits/rejected": -3.5998117923736572, "logps/chosen": -19.0526123046875, "logps/rejected": -69.45529174804688, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 3.8056230545043945, "rewards/margins": 3.8056230545043945, "rewards/rejected": 0.0, "step": 1927 }, { "epoch": 10.77094972067039, "grad_norm": 1.6213822192455811, "learning_rate": 2.916878105614002e-07, "logits/chosen": -3.3806276321411133, "logits/rejected": -3.4644851684570312, "logps/chosen": -0.5472477674484253, "logps/rejected": -49.34136962890625, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 1.5164899826049805, "rewards/margins": 1.5164899826049805, "rewards/rejected": 0.0, "step": 1928 }, { "epoch": 10.776536312849162, "grad_norm": 0.5263298691262972, "learning_rate": 2.91133841503694e-07, "logits/chosen": -3.7453887462615967, "logits/rejected": -3.672898292541504, "logps/chosen": -1.9932701587677002, "logps/rejected": -46.162261962890625, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 2.4026293754577637, "rewards/margins": 2.4026293754577637, "rewards/rejected": 0.0, "step": 1929 }, { "epoch": 10.782122905027933, "grad_norm": 1.0631947297476212, "learning_rate": 2.9058018285829687e-07, "logits/chosen": -3.469196081161499, "logits/rejected": -3.448237895965576, "logps/chosen": -2.9169487953186035, "logps/rejected": -35.945030212402344, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 2.2502312660217285, "rewards/margins": 2.2502312660217285, "rewards/rejected": 0.0, "step": 1930 }, { "epoch": 10.787709497206704, "grad_norm": 1.058273935507051, "learning_rate": 2.900268354480432e-07, "logits/chosen": -3.6098685264587402, "logits/rejected": -3.6558334827423096, "logps/chosen": -2.7207980155944824, "logps/rejected": -37.187965393066406, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 3.113663673400879, "rewards/margins": 3.113663673400879, "rewards/rejected": 0.0, "step": 1931 }, { "epoch": 10.793296089385475, "grad_norm": 1.5996199889833045, "learning_rate": 2.8947380009530643e-07, "logits/chosen": -3.5827581882476807, "logits/rejected": -3.387035846710205, "logps/chosen": -3.351130485534668, "logps/rejected": -29.75267791748047, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 2.2918968200683594, "rewards/margins": 2.2918968200683594, "rewards/rejected": 0.0, "step": 1932 }, { "epoch": 10.798882681564246, "grad_norm": 0.9484081597408495, "learning_rate": 2.8892107762199536e-07, "logits/chosen": -3.4980547428131104, "logits/rejected": -3.5675454139709473, "logps/chosen": -2.8483567237854004, "logps/rejected": -50.05510711669922, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 2.7646398544311523, "rewards/margins": 2.7646398544311523, "rewards/rejected": 0.0, "step": 1933 }, { "epoch": 10.804469273743017, "grad_norm": 1.644980661624941, "learning_rate": 2.8836866884955405e-07, "logits/chosen": -3.267019271850586, "logits/rejected": -3.480748414993286, "logps/chosen": -6.18753719329834, "logps/rejected": -50.49284362792969, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 2.6509337425231934, "rewards/margins": 2.6509337425231934, "rewards/rejected": 0.0, "step": 1934 }, { "epoch": 10.810055865921788, "grad_norm": 0.5187585647974681, "learning_rate": 2.8781657459896035e-07, "logits/chosen": -3.3421170711517334, "logits/rejected": -3.4056236743927, "logps/chosen": -9.775256156921387, "logps/rejected": -48.3869514465332, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": 3.34415864944458, "rewards/margins": 3.34415864944458, "rewards/rejected": 0.0, "step": 1935 }, { "epoch": 10.815642458100559, "grad_norm": 0.5438588348285353, "learning_rate": 2.872647956907246e-07, "logits/chosen": -3.78772234916687, "logits/rejected": -3.7548537254333496, "logps/chosen": -0.6591119170188904, "logps/rejected": -86.69601440429688, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": 2.208374500274658, "rewards/margins": 2.208374500274658, "rewards/rejected": 0.0, "step": 1936 }, { "epoch": 10.82122905027933, "grad_norm": 1.9386996272615775, "learning_rate": 2.8671333294488846e-07, "logits/chosen": -3.363569974899292, "logits/rejected": -3.1417958736419678, "logps/chosen": -1.0323398113250732, "logps/rejected": -63.85051345825195, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.382911443710327, "rewards/margins": 2.382911443710327, "rewards/rejected": 0.0, "step": 1937 }, { "epoch": 10.8268156424581, "grad_norm": 0.9552522085373845, "learning_rate": 2.8616218718102387e-07, "logits/chosen": -3.3511106967926025, "logits/rejected": -3.5561373233795166, "logps/chosen": -17.523204803466797, "logps/rejected": -42.81861114501953, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.9075281620025635, "rewards/margins": 2.9075281620025635, "rewards/rejected": 0.0, "step": 1938 }, { "epoch": 10.832402234636872, "grad_norm": 2.3717920822897516, "learning_rate": 2.8561135921823156e-07, "logits/chosen": -3.3507566452026367, "logits/rejected": -3.2384164333343506, "logps/chosen": -1.3525452613830566, "logps/rejected": -77.60067749023438, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 1.6592729091644287, "rewards/margins": 1.6592729091644287, "rewards/rejected": 0.0, "step": 1939 }, { "epoch": 10.837988826815643, "grad_norm": 1.4011181177985683, "learning_rate": 2.8506084987513977e-07, "logits/chosen": -3.2683188915252686, "logits/rejected": -3.394500732421875, "logps/chosen": -1.9470837116241455, "logps/rejected": -30.30990982055664, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 2.501068592071533, "rewards/margins": 2.501068592071533, "rewards/rejected": 0.0, "step": 1940 }, { "epoch": 10.837988826815643, "eval_logits/chosen": -3.294837236404419, "eval_logits/rejected": -3.4212710857391357, "eval_logps/chosen": -28.783550262451172, "eval_logps/rejected": -52.66058349609375, "eval_loss": 0.921066164970398, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.12864917516708374, "eval_rewards/margins": 0.12864917516708374, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6845, "eval_samples_per_second": 9.485, "eval_steps_per_second": 0.306, "step": 1940 }, { "epoch": 10.843575418994414, "grad_norm": 1.4302198050688797, "learning_rate": 2.845106599699041e-07, "logits/chosen": -3.2402455806732178, "logits/rejected": -3.3550610542297363, "logps/chosen": -3.4866740703582764, "logps/rejected": -29.693632125854492, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 2.1640701293945312, "rewards/margins": 2.1640701293945312, "rewards/rejected": 0.0, "step": 1941 }, { "epoch": 10.849162011173185, "grad_norm": 0.7268407995244136, "learning_rate": 2.8396079032020383e-07, "logits/chosen": -3.356489419937134, "logits/rejected": -3.349102020263672, "logps/chosen": -2.3652195930480957, "logps/rejected": -64.1489486694336, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 3.05629301071167, "rewards/margins": 3.05629301071167, "rewards/rejected": 0.0, "step": 1942 }, { "epoch": 10.854748603351956, "grad_norm": 0.9204822561237457, "learning_rate": 2.834112417432437e-07, "logits/chosen": -2.7726292610168457, "logits/rejected": -2.7120933532714844, "logps/chosen": -6.252779006958008, "logps/rejected": -46.49617004394531, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 3.0688788890838623, "rewards/margins": 3.0688788890838623, "rewards/rejected": 0.0, "step": 1943 }, { "epoch": 10.860335195530727, "grad_norm": 2.1601964192839476, "learning_rate": 2.828620150557508e-07, "logits/chosen": -3.430494785308838, "logits/rejected": -3.464099407196045, "logps/chosen": -1.9049228429794312, "logps/rejected": -42.39170455932617, "loss": 0.0863, "rewards/accuracies": 1.0, "rewards/chosen": 2.5023398399353027, "rewards/margins": 2.5023398399353027, "rewards/rejected": 0.0, "step": 1944 }, { "epoch": 10.865921787709498, "grad_norm": 2.032897164494274, "learning_rate": 2.823131110739737e-07, "logits/chosen": -3.3368167877197266, "logits/rejected": -3.2068445682525635, "logps/chosen": -0.6238577365875244, "logps/rejected": -100.99341583251953, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": 2.0053141117095947, "rewards/margins": 2.0053141117095947, "rewards/rejected": 0.0, "step": 1945 }, { "epoch": 10.871508379888269, "grad_norm": 0.535290823782499, "learning_rate": 2.8176453061368143e-07, "logits/chosen": -3.3537046909332275, "logits/rejected": -3.333434581756592, "logps/chosen": -6.67498779296875, "logps/rejected": -36.00105285644531, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.6048285961151123, "rewards/margins": 2.6048285961151123, "rewards/rejected": 0.0, "step": 1946 }, { "epoch": 10.87709497206704, "grad_norm": 0.568975661018824, "learning_rate": 2.8121627449016225e-07, "logits/chosen": -3.5854060649871826, "logits/rejected": -3.5121772289276123, "logps/chosen": -9.655231475830078, "logps/rejected": -59.148563385009766, "loss": 0.1263, "rewards/accuracies": 1.0, "rewards/chosen": 3.508772850036621, "rewards/margins": 3.508772850036621, "rewards/rejected": 0.0, "step": 1947 }, { "epoch": 10.88268156424581, "grad_norm": 0.9634577235067475, "learning_rate": 2.8066834351822255e-07, "logits/chosen": -3.4804861545562744, "logits/rejected": -3.5888350009918213, "logps/chosen": -2.7011873722076416, "logps/rejected": -75.55279541015625, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": 2.418036937713623, "rewards/margins": 2.418036937713623, "rewards/rejected": 0.0, "step": 1948 }, { "epoch": 10.888268156424582, "grad_norm": 0.7512882568369851, "learning_rate": 2.8012073851218486e-07, "logits/chosen": -3.5435726642608643, "logits/rejected": -3.4737308025360107, "logps/chosen": -2.4472789764404297, "logps/rejected": -44.22047805786133, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 2.8572998046875, "rewards/margins": 2.8572998046875, "rewards/rejected": 0.0, "step": 1949 }, { "epoch": 10.893854748603353, "grad_norm": 0.5463667182865234, "learning_rate": 2.7957346028588856e-07, "logits/chosen": -3.351771831512451, "logits/rejected": -3.396040916442871, "logps/chosen": -24.655719757080078, "logps/rejected": -61.6947021484375, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": 3.0992283821105957, "rewards/margins": 3.0992283821105957, "rewards/rejected": 0.0, "step": 1950 }, { "epoch": 10.899441340782122, "grad_norm": 0.9779824265490464, "learning_rate": 2.790265096526857e-07, "logits/chosen": -3.2073376178741455, "logits/rejected": -3.3103933334350586, "logps/chosen": -0.8609297871589661, "logps/rejected": -63.67443084716797, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 1.9942567348480225, "rewards/margins": 1.9942567348480225, "rewards/rejected": 0.0, "step": 1951 }, { "epoch": 10.905027932960895, "grad_norm": 3.878224074611816, "learning_rate": 2.7847988742544295e-07, "logits/chosen": -3.391176462173462, "logits/rejected": -3.37831711769104, "logps/chosen": -1.989864706993103, "logps/rejected": -58.87008285522461, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 2.503532886505127, "rewards/margins": 2.503532886505127, "rewards/rejected": 0.0, "step": 1952 }, { "epoch": 10.910614525139664, "grad_norm": 1.061200239757773, "learning_rate": 2.7793359441653805e-07, "logits/chosen": -3.3318586349487305, "logits/rejected": -3.3976187705993652, "logps/chosen": -1.9514379501342773, "logps/rejected": -41.914207458496094, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": 2.640911102294922, "rewards/margins": 2.640911102294922, "rewards/rejected": 0.0, "step": 1953 }, { "epoch": 10.916201117318435, "grad_norm": 0.6202107189716216, "learning_rate": 2.773876314378597e-07, "logits/chosen": -3.4760282039642334, "logits/rejected": -3.516247510910034, "logps/chosen": -19.76568031311035, "logps/rejected": -42.70884704589844, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 4.23059606552124, "rewards/margins": 4.23059606552124, "rewards/rejected": 0.0, "step": 1954 }, { "epoch": 10.921787709497206, "grad_norm": 0.6151641360085743, "learning_rate": 2.768419993008061e-07, "logits/chosen": -3.4956982135772705, "logits/rejected": -3.478393793106079, "logps/chosen": -0.6943643689155579, "logps/rejected": -74.71638488769531, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 1.9081413745880127, "rewards/margins": 1.9081413745880127, "rewards/rejected": 0.0, "step": 1955 }, { "epoch": 10.927374301675977, "grad_norm": 0.6040467280421871, "learning_rate": 2.762966988162838e-07, "logits/chosen": -3.4565107822418213, "logits/rejected": -3.611046314239502, "logps/chosen": -1.6663256883621216, "logps/rejected": -27.423904418945312, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.8244056701660156, "rewards/margins": 2.8244056701660156, "rewards/rejected": 0.0, "step": 1956 }, { "epoch": 10.932960893854748, "grad_norm": 4.431823116814799, "learning_rate": 2.757517307947065e-07, "logits/chosen": -3.713186264038086, "logits/rejected": -3.288609266281128, "logps/chosen": -2.0880815982818604, "logps/rejected": -53.6541633605957, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 2.3980512619018555, "rewards/margins": 2.3980512619018555, "rewards/rejected": 0.0, "step": 1957 }, { "epoch": 10.938547486033519, "grad_norm": 1.4618321912785606, "learning_rate": 2.752070960459934e-07, "logits/chosen": -3.5102028846740723, "logits/rejected": -3.429625988006592, "logps/chosen": -1.5148488283157349, "logps/rejected": -29.034435272216797, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": 1.9058281183242798, "rewards/margins": 1.9058281183242798, "rewards/rejected": 0.0, "step": 1958 }, { "epoch": 10.94413407821229, "grad_norm": 1.0733951619882203, "learning_rate": 2.746627953795694e-07, "logits/chosen": -3.506091356277466, "logits/rejected": -3.4129672050476074, "logps/chosen": -3.1617989540100098, "logps/rejected": -41.68645095825195, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 3.317117214202881, "rewards/margins": 3.317117214202881, "rewards/rejected": 0.0, "step": 1959 }, { "epoch": 10.949720670391061, "grad_norm": 0.8766754576708049, "learning_rate": 2.741188296043615e-07, "logits/chosen": -3.3641016483306885, "logits/rejected": -3.35844087600708, "logps/chosen": -3.9195914268493652, "logps/rejected": -46.63465881347656, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 2.267271041870117, "rewards/margins": 2.267271041870117, "rewards/rejected": 0.0, "step": 1960 }, { "epoch": 10.949720670391061, "eval_logits/chosen": -3.294306516647339, "eval_logits/rejected": -3.421966075897217, "eval_logps/chosen": -28.862768173217773, "eval_logps/rejected": -52.48956298828125, "eval_loss": 0.9208307266235352, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.12072701752185822, "eval_rewards/margins": 0.12072701752185822, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6643, "eval_samples_per_second": 9.49, "eval_steps_per_second": 0.306, "step": 1960 }, { "epoch": 10.955307262569832, "grad_norm": 1.0814898778876383, "learning_rate": 2.7357519952880023e-07, "logits/chosen": -3.5372488498687744, "logits/rejected": -3.697329044342041, "logps/chosen": -12.813082695007324, "logps/rejected": -74.08599853515625, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 3.092062473297119, "rewards/margins": 3.092062473297119, "rewards/rejected": 0.0, "step": 1961 }, { "epoch": 10.960893854748603, "grad_norm": 0.9277978725629857, "learning_rate": 2.7303190596081645e-07, "logits/chosen": -3.6069722175598145, "logits/rejected": -3.2105302810668945, "logps/chosen": -0.25405922532081604, "logps/rejected": -116.97374725341797, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": 1.2921063899993896, "rewards/margins": 1.2921063899993896, "rewards/rejected": 0.0, "step": 1962 }, { "epoch": 10.966480446927374, "grad_norm": 5.685547353250434, "learning_rate": 2.7248894970784127e-07, "logits/chosen": -3.3292641639709473, "logits/rejected": -3.362541913986206, "logps/chosen": -0.9024280309677124, "logps/rejected": -110.84246826171875, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.2061171531677246, "rewards/margins": 2.2061171531677246, "rewards/rejected": 0.0, "step": 1963 }, { "epoch": 10.972067039106145, "grad_norm": 1.3361292153172168, "learning_rate": 2.719463315768043e-07, "logits/chosen": -3.4417128562927246, "logits/rejected": -3.208587408065796, "logps/chosen": -1.2682263851165771, "logps/rejected": -44.37663269042969, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.668027639389038, "rewards/margins": 2.668027639389038, "rewards/rejected": 0.0, "step": 1964 }, { "epoch": 10.977653631284916, "grad_norm": 1.0865640081898422, "learning_rate": 2.7140405237413275e-07, "logits/chosen": -3.6909279823303223, "logits/rejected": -3.6696698665618896, "logps/chosen": -1.5324313640594482, "logps/rejected": -29.734695434570312, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 2.615619659423828, "rewards/margins": 2.615619659423828, "rewards/rejected": 0.0, "step": 1965 }, { "epoch": 10.983240223463687, "grad_norm": 0.5593576686892301, "learning_rate": 2.708621129057501e-07, "logits/chosen": -3.3819236755371094, "logits/rejected": -3.462059497833252, "logps/chosen": -14.919900894165039, "logps/rejected": -67.44153594970703, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 3.56858491897583, "rewards/margins": 3.56858491897583, "rewards/rejected": 0.0, "step": 1966 }, { "epoch": 10.988826815642458, "grad_norm": 0.6010040579794459, "learning_rate": 2.703205139770749e-07, "logits/chosen": -3.402266263961792, "logits/rejected": -3.3524739742279053, "logps/chosen": -7.726075649261475, "logps/rejected": -73.15519714355469, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 2.823573589324951, "rewards/margins": 2.823573589324951, "rewards/rejected": 0.0, "step": 1967 }, { "epoch": 10.994413407821229, "grad_norm": 0.5636415023647893, "learning_rate": 2.6977925639301955e-07, "logits/chosen": -3.5721802711486816, "logits/rejected": -3.4628896713256836, "logps/chosen": -1.6515507698059082, "logps/rejected": -62.12843322753906, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.9263627529144287, "rewards/margins": 2.9263627529144287, "rewards/rejected": 0.0, "step": 1968 }, { "epoch": 11.0, "grad_norm": 0.6608148885043948, "learning_rate": 2.6923834095798914e-07, "logits/chosen": -3.3973584175109863, "logits/rejected": -3.4416000843048096, "logps/chosen": -3.7127230167388916, "logps/rejected": -43.00478744506836, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": 2.04290771484375, "rewards/margins": 2.04290771484375, "rewards/rejected": 0.0, "step": 1969 }, { "epoch": 11.005586592178771, "grad_norm": 0.7570754454846276, "learning_rate": 2.6869776847588076e-07, "logits/chosen": -3.223130464553833, "logits/rejected": -3.216498613357544, "logps/chosen": -4.856444835662842, "logps/rejected": -37.17080307006836, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 2.8240160942077637, "rewards/margins": 2.8240160942077637, "rewards/rejected": 0.0, "step": 1970 }, { "epoch": 11.011173184357542, "grad_norm": 0.7587609785066302, "learning_rate": 2.681575397500808e-07, "logits/chosen": -3.4802913665771484, "logits/rejected": -3.5363049507141113, "logps/chosen": -12.489826202392578, "logps/rejected": -59.16836166381836, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 3.173032760620117, "rewards/margins": 3.173032760620117, "rewards/rejected": 0.0, "step": 1971 }, { "epoch": 11.016759776536313, "grad_norm": 0.5487584039154622, "learning_rate": 2.676176555834658e-07, "logits/chosen": -3.7237424850463867, "logits/rejected": -3.539865493774414, "logps/chosen": -7.172542572021484, "logps/rejected": -32.795982360839844, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": 3.917008876800537, "rewards/margins": 3.917008876800537, "rewards/rejected": 0.0, "step": 1972 }, { "epoch": 11.022346368715084, "grad_norm": 1.3067089064331106, "learning_rate": 2.6707811677839975e-07, "logits/chosen": -3.0273399353027344, "logits/rejected": -3.053793430328369, "logps/chosen": -1.631102204322815, "logps/rejected": -64.28227996826172, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 2.1646173000335693, "rewards/margins": 2.1646173000335693, "rewards/rejected": 0.0, "step": 1973 }, { "epoch": 11.027932960893855, "grad_norm": 0.45809697672224176, "learning_rate": 2.6653892413673316e-07, "logits/chosen": -3.6722640991210938, "logits/rejected": -3.7272047996520996, "logps/chosen": -4.221530914306641, "logps/rejected": -37.29861068725586, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 3.3270933628082275, "rewards/margins": 3.3270933628082275, "rewards/rejected": 0.0, "step": 1974 }, { "epoch": 11.033519553072626, "grad_norm": 0.4210681518111144, "learning_rate": 2.6600007845980256e-07, "logits/chosen": -3.356821298599243, "logits/rejected": -3.310434341430664, "logps/chosen": -25.088289260864258, "logps/rejected": -42.456703186035156, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 3.16225528717041, "rewards/margins": 3.16225528717041, "rewards/rejected": 0.0, "step": 1975 }, { "epoch": 11.039106145251397, "grad_norm": 1.9968140360414173, "learning_rate": 2.654615805484284e-07, "logits/chosen": -3.5969486236572266, "logits/rejected": -3.580840587615967, "logps/chosen": -12.102442741394043, "logps/rejected": -32.60572814941406, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 3.4988389015197754, "rewards/margins": 3.4988389015197754, "rewards/rejected": 0.0, "step": 1976 }, { "epoch": 11.044692737430168, "grad_norm": 0.4183318629887757, "learning_rate": 2.649234312029145e-07, "logits/chosen": -3.3813321590423584, "logits/rejected": -3.486619234085083, "logps/chosen": -3.1055541038513184, "logps/rejected": -53.136512756347656, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.6955175399780273, "rewards/margins": 2.6955175399780273, "rewards/rejected": 0.0, "step": 1977 }, { "epoch": 11.050279329608939, "grad_norm": 0.4650920392732018, "learning_rate": 2.6438563122304657e-07, "logits/chosen": -3.3683040142059326, "logits/rejected": -3.384139060974121, "logps/chosen": -9.655780792236328, "logps/rejected": -37.85082244873047, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 3.1659979820251465, "rewards/margins": 3.1659979820251465, "rewards/rejected": 0.0, "step": 1978 }, { "epoch": 11.05586592178771, "grad_norm": 0.6060572375804102, "learning_rate": 2.638481814080916e-07, "logits/chosen": -3.5283408164978027, "logits/rejected": -3.427849292755127, "logps/chosen": -1.1676838397979736, "logps/rejected": -48.28765106201172, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 2.4302749633789062, "rewards/margins": 2.4302749633789062, "rewards/rejected": 0.0, "step": 1979 }, { "epoch": 11.061452513966481, "grad_norm": 0.4322984282245594, "learning_rate": 2.633110825567951e-07, "logits/chosen": -3.6758594512939453, "logits/rejected": -3.6464524269104004, "logps/chosen": -7.674620151519775, "logps/rejected": -60.0107421875, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 3.139512300491333, "rewards/margins": 3.139512300491333, "rewards/rejected": 0.0, "step": 1980 }, { "epoch": 11.061452513966481, "eval_logits/chosen": -3.2864978313446045, "eval_logits/rejected": -3.4144539833068848, "eval_logps/chosen": -29.15273666381836, "eval_logps/rejected": -53.541419982910156, "eval_loss": 0.9443025588989258, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.09173060208559036, "eval_rewards/margins": 0.09173060208559036, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6928, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 1980 }, { "epoch": 11.067039106145252, "grad_norm": 0.4754917243396108, "learning_rate": 2.6277433546738205e-07, "logits/chosen": -3.230081558227539, "logits/rejected": -3.004481554031372, "logps/chosen": -0.15751391649246216, "logps/rejected": -70.2298583984375, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 1.1462054252624512, "rewards/margins": 1.1462054252624512, "rewards/rejected": 0.0, "step": 1981 }, { "epoch": 11.072625698324023, "grad_norm": 0.48976582866360574, "learning_rate": 2.62237940937554e-07, "logits/chosen": -3.5290935039520264, "logits/rejected": -3.5418808460235596, "logps/chosen": -4.300559043884277, "logps/rejected": -35.070491790771484, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 3.1748969554901123, "rewards/margins": 3.1748969554901123, "rewards/rejected": 0.0, "step": 1982 }, { "epoch": 11.078212290502794, "grad_norm": 0.4792929877417928, "learning_rate": 2.617018997644889e-07, "logits/chosen": -3.114351511001587, "logits/rejected": -3.112081527709961, "logps/chosen": -10.51168155670166, "logps/rejected": -52.60894775390625, "loss": 0.1098, "rewards/accuracies": 1.0, "rewards/chosen": 3.372950792312622, "rewards/margins": 3.372950792312622, "rewards/rejected": 0.0, "step": 1983 }, { "epoch": 11.083798882681565, "grad_norm": 0.6160879245652107, "learning_rate": 2.611662127448393e-07, "logits/chosen": -3.4955079555511475, "logits/rejected": -3.5562283992767334, "logps/chosen": -15.152576446533203, "logps/rejected": -68.93836975097656, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 2.657057523727417, "rewards/margins": 2.657057523727417, "rewards/rejected": 0.0, "step": 1984 }, { "epoch": 11.089385474860336, "grad_norm": 0.41305624555883874, "learning_rate": 2.6063088067473156e-07, "logits/chosen": -3.2018439769744873, "logits/rejected": -3.2682015895843506, "logps/chosen": -8.8161039352417, "logps/rejected": -29.48605728149414, "loss": 0.1102, "rewards/accuracies": 1.0, "rewards/chosen": 3.0993926525115967, "rewards/margins": 3.0993926525115967, "rewards/rejected": 0.0, "step": 1985 }, { "epoch": 11.094972067039107, "grad_norm": 0.5248289945571151, "learning_rate": 2.6009590434976444e-07, "logits/chosen": -3.2796809673309326, "logits/rejected": -3.429037094116211, "logps/chosen": -4.598927974700928, "logps/rejected": -62.207794189453125, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 2.3135581016540527, "rewards/margins": 2.3135581016540527, "rewards/rejected": 0.0, "step": 1986 }, { "epoch": 11.100558659217878, "grad_norm": 0.46065714416705367, "learning_rate": 2.595612845650078e-07, "logits/chosen": -3.3993310928344727, "logits/rejected": -3.231445074081421, "logps/chosen": -7.13292121887207, "logps/rejected": -73.23664855957031, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 2.9397664070129395, "rewards/margins": 2.9397664070129395, "rewards/rejected": 0.0, "step": 1987 }, { "epoch": 11.106145251396647, "grad_norm": 0.4565078942258675, "learning_rate": 2.5902702211500246e-07, "logits/chosen": -3.4503095149993896, "logits/rejected": -3.3747165203094482, "logps/chosen": -3.5082180500030518, "logps/rejected": -67.02780151367188, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 2.288248062133789, "rewards/margins": 2.288248062133789, "rewards/rejected": 0.0, "step": 1988 }, { "epoch": 11.111731843575418, "grad_norm": 0.4719089458995008, "learning_rate": 2.584931177937569e-07, "logits/chosen": -3.2662644386291504, "logits/rejected": -3.3980376720428467, "logps/chosen": -1.2309666872024536, "logps/rejected": -61.435935974121094, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 2.695246934890747, "rewards/margins": 2.695246934890747, "rewards/rejected": 0.0, "step": 1989 }, { "epoch": 11.11731843575419, "grad_norm": 0.6205757416787661, "learning_rate": 2.5795957239474853e-07, "logits/chosen": -3.261859893798828, "logits/rejected": -3.39766526222229, "logps/chosen": -0.8327498435974121, "logps/rejected": -61.17940139770508, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 2.063709020614624, "rewards/margins": 2.063709020614624, "rewards/rejected": 0.0, "step": 1990 }, { "epoch": 11.12290502793296, "grad_norm": 1.8285638134381723, "learning_rate": 2.5742638671092054e-07, "logits/chosen": -3.409914016723633, "logits/rejected": -3.5615062713623047, "logps/chosen": -4.146988868713379, "logps/rejected": -44.768638610839844, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 3.5233983993530273, "rewards/margins": 3.5233983993530273, "rewards/rejected": 0.0, "step": 1991 }, { "epoch": 11.128491620111731, "grad_norm": 0.5714640768278682, "learning_rate": 2.568935615346819e-07, "logits/chosen": -3.1631546020507812, "logits/rejected": -3.1505377292633057, "logps/chosen": -4.494215488433838, "logps/rejected": -57.05901336669922, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 3.5487918853759766, "rewards/margins": 3.5487918853759766, "rewards/rejected": 0.0, "step": 1992 }, { "epoch": 11.134078212290502, "grad_norm": 2.102452150592736, "learning_rate": 2.563610976579057e-07, "logits/chosen": -3.577125072479248, "logits/rejected": -3.68088698387146, "logps/chosen": -0.7124360799789429, "logps/rejected": -57.468414306640625, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.1272451877593994, "rewards/margins": 2.1272451877593994, "rewards/rejected": 0.0, "step": 1993 }, { "epoch": 11.139664804469273, "grad_norm": 1.9838899032802388, "learning_rate": 2.558289958719282e-07, "logits/chosen": -3.1666100025177, "logits/rejected": -3.02303409576416, "logps/chosen": -2.4945926666259766, "logps/rejected": -40.52445983886719, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 3.230691909790039, "rewards/margins": 3.230691909790039, "rewards/rejected": 0.0, "step": 1994 }, { "epoch": 11.145251396648044, "grad_norm": 0.4207349330247922, "learning_rate": 2.552972569675472e-07, "logits/chosen": -3.5099363327026367, "logits/rejected": -3.418095350265503, "logps/chosen": -17.87454605102539, "logps/rejected": -29.387659072875977, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 3.9897732734680176, "rewards/margins": 3.9897732734680176, "rewards/rejected": 0.0, "step": 1995 }, { "epoch": 11.150837988826815, "grad_norm": 1.3450358018556177, "learning_rate": 2.5476588173502156e-07, "logits/chosen": -3.435154438018799, "logits/rejected": -3.3458993434906006, "logps/chosen": -9.573295593261719, "logps/rejected": -66.47622680664062, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 3.4141845703125, "rewards/margins": 3.4141845703125, "rewards/rejected": 0.0, "step": 1996 }, { "epoch": 11.156424581005586, "grad_norm": 2.1526471206314595, "learning_rate": 2.542348709640695e-07, "logits/chosen": -3.580726385116577, "logits/rejected": -3.455291748046875, "logps/chosen": -1.1961976289749146, "logps/rejected": -44.997676849365234, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 2.362941026687622, "rewards/margins": 2.362941026687622, "rewards/rejected": 0.0, "step": 1997 }, { "epoch": 11.162011173184357, "grad_norm": 0.5094837035971311, "learning_rate": 2.5370422544386726e-07, "logits/chosen": -3.2765495777130127, "logits/rejected": -3.47189998626709, "logps/chosen": -0.45980337262153625, "logps/rejected": -67.9575424194336, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 1.857908010482788, "rewards/margins": 1.857908010482788, "rewards/rejected": 0.0, "step": 1998 }, { "epoch": 11.167597765363128, "grad_norm": 0.6650143731143028, "learning_rate": 2.531739459630494e-07, "logits/chosen": -3.3117434978485107, "logits/rejected": -3.202538013458252, "logps/chosen": -5.910820484161377, "logps/rejected": -44.21772003173828, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 3.5671982765197754, "rewards/margins": 3.5671982765197754, "rewards/rejected": 0.0, "step": 1999 }, { "epoch": 11.1731843575419, "grad_norm": 1.0015887536491135, "learning_rate": 2.5264403330970466e-07, "logits/chosen": -3.384261131286621, "logits/rejected": -3.4819624423980713, "logps/chosen": -1.515886902809143, "logps/rejected": -64.92242431640625, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 2.557046890258789, "rewards/margins": 2.557046890258789, "rewards/rejected": 0.0, "step": 2000 }, { "epoch": 11.1731843575419, "eval_logits/chosen": -3.290687084197998, "eval_logits/rejected": -3.415971279144287, "eval_logps/chosen": -29.300617218017578, "eval_logps/rejected": -53.619422912597656, "eval_loss": 0.9535685777664185, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.07694248855113983, "eval_rewards/margins": 0.07694248855113983, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6671, "eval_samples_per_second": 9.49, "eval_steps_per_second": 0.306, "step": 2000 }, { "epoch": 11.17877094972067, "grad_norm": 3.398848744419848, "learning_rate": 2.5211448827137837e-07, "logits/chosen": -3.575794219970703, "logits/rejected": -3.443030834197998, "logps/chosen": -1.1924731731414795, "logps/rejected": -48.99359130859375, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 2.109663486480713, "rewards/margins": 2.109663486480713, "rewards/rejected": 0.0, "step": 2001 }, { "epoch": 11.184357541899441, "grad_norm": 0.4464897914448476, "learning_rate": 2.5158531163506854e-07, "logits/chosen": -3.6060190200805664, "logits/rejected": -3.4800190925598145, "logps/chosen": -0.4656723141670227, "logps/rejected": -80.21681213378906, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 2.1573047637939453, "rewards/margins": 2.1573047637939453, "rewards/rejected": 0.0, "step": 2002 }, { "epoch": 11.189944134078212, "grad_norm": 0.5631812622021007, "learning_rate": 2.510565041872258e-07, "logits/chosen": -3.494626760482788, "logits/rejected": -3.3245034217834473, "logps/chosen": -7.909552574157715, "logps/rejected": -34.34803009033203, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 3.4123058319091797, "rewards/margins": 3.4123058319091797, "rewards/rejected": 0.0, "step": 2003 }, { "epoch": 11.195530726256983, "grad_norm": 1.1083021166285107, "learning_rate": 2.5052806671375224e-07, "logits/chosen": -3.471977949142456, "logits/rejected": -3.5947768688201904, "logps/chosen": -0.12925735116004944, "logps/rejected": -33.51356506347656, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 1.4285383224487305, "rewards/margins": 1.4285383224487305, "rewards/rejected": 0.0, "step": 2004 }, { "epoch": 11.201117318435754, "grad_norm": 0.5050854730725913, "learning_rate": 2.500000000000001e-07, "logits/chosen": -3.2237627506256104, "logits/rejected": -3.2795255184173584, "logps/chosen": -1.5750045776367188, "logps/rejected": -49.049198150634766, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 2.1445679664611816, "rewards/margins": 2.1445679664611816, "rewards/rejected": 0.0, "step": 2005 }, { "epoch": 11.206703910614525, "grad_norm": 0.7780193463055313, "learning_rate": 2.4947230483077046e-07, "logits/chosen": -3.7185072898864746, "logits/rejected": -3.555849552154541, "logps/chosen": -2.027606725692749, "logps/rejected": -31.118919372558594, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 2.578911304473877, "rewards/margins": 2.578911304473877, "rewards/rejected": 0.0, "step": 2006 }, { "epoch": 11.212290502793296, "grad_norm": 0.7515147029504262, "learning_rate": 2.489449819903123e-07, "logits/chosen": -3.58176326751709, "logits/rejected": -3.5337986946105957, "logps/chosen": -6.2703022956848145, "logps/rejected": -36.084991455078125, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": 3.4396705627441406, "rewards/margins": 3.4396705627441406, "rewards/rejected": 0.0, "step": 2007 }, { "epoch": 11.217877094972067, "grad_norm": 0.6890743153357715, "learning_rate": 2.4841803226232166e-07, "logits/chosen": -3.3658447265625, "logits/rejected": -3.2889177799224854, "logps/chosen": -1.3058695793151855, "logps/rejected": -74.79275512695312, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": 2.2486634254455566, "rewards/margins": 2.2486634254455566, "rewards/rejected": 0.0, "step": 2008 }, { "epoch": 11.223463687150838, "grad_norm": 0.43004423077332365, "learning_rate": 2.47891456429939e-07, "logits/chosen": -3.491614818572998, "logits/rejected": -3.4320404529571533, "logps/chosen": -0.6702783107757568, "logps/rejected": -48.801658630371094, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 2.3724660873413086, "rewards/margins": 2.3724660873413086, "rewards/rejected": 0.0, "step": 2009 }, { "epoch": 11.22905027932961, "grad_norm": 0.5405604310322775, "learning_rate": 2.473652552757504e-07, "logits/chosen": -3.5797393321990967, "logits/rejected": -3.596543550491333, "logps/chosen": -1.1575409173965454, "logps/rejected": -67.98393249511719, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 1.9111378192901611, "rewards/margins": 1.9111378192901611, "rewards/rejected": 0.0, "step": 2010 }, { "epoch": 11.23463687150838, "grad_norm": 1.6557292561068724, "learning_rate": 2.4683942958178416e-07, "logits/chosen": -3.2692625522613525, "logits/rejected": -3.3795197010040283, "logps/chosen": -2.376598358154297, "logps/rejected": -32.81221008300781, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": 2.5531296730041504, "rewards/margins": 2.5531296730041504, "rewards/rejected": 0.0, "step": 2011 }, { "epoch": 11.240223463687151, "grad_norm": 0.45609308312774133, "learning_rate": 2.4631398012951104e-07, "logits/chosen": -3.3126449584960938, "logits/rejected": -3.3992831707000732, "logps/chosen": -14.70008659362793, "logps/rejected": -40.075138092041016, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 3.3907418251037598, "rewards/margins": 3.3907418251037598, "rewards/rejected": 0.0, "step": 2012 }, { "epoch": 11.245810055865922, "grad_norm": 0.5591609311118848, "learning_rate": 2.4578890769984227e-07, "logits/chosen": -3.0882582664489746, "logits/rejected": -3.1441426277160645, "logps/chosen": -3.592134475708008, "logps/rejected": -38.634803771972656, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 3.2177412509918213, "rewards/margins": 3.2177412509918213, "rewards/rejected": 0.0, "step": 2013 }, { "epoch": 11.251396648044693, "grad_norm": 0.44512968524400404, "learning_rate": 2.4526421307312954e-07, "logits/chosen": -3.2984673976898193, "logits/rejected": -3.168893337249756, "logps/chosen": -8.026363372802734, "logps/rejected": -43.317626953125, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 3.16433048248291, "rewards/margins": 3.16433048248291, "rewards/rejected": 0.0, "step": 2014 }, { "epoch": 11.256983240223464, "grad_norm": 0.5264800157859246, "learning_rate": 2.4473989702916197e-07, "logits/chosen": -3.5242347717285156, "logits/rejected": -3.5590171813964844, "logps/chosen": -1.7915736436843872, "logps/rejected": -26.157859802246094, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 2.661710739135742, "rewards/margins": 2.661710739135742, "rewards/rejected": 0.0, "step": 2015 }, { "epoch": 11.262569832402235, "grad_norm": 0.48931397758179795, "learning_rate": 2.4421596034716657e-07, "logits/chosen": -3.1743080615997314, "logits/rejected": -3.1876299381256104, "logps/chosen": -7.518608093261719, "logps/rejected": -57.348182678222656, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 3.2221872806549072, "rewards/margins": 3.2221872806549072, "rewards/rejected": 0.0, "step": 2016 }, { "epoch": 11.268156424581006, "grad_norm": 0.47034890465607826, "learning_rate": 2.4369240380580713e-07, "logits/chosen": -3.6134889125823975, "logits/rejected": -3.5242602825164795, "logps/chosen": -1.8971567153930664, "logps/rejected": -62.129615783691406, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 2.5112192630767822, "rewards/margins": 2.5112192630767822, "rewards/rejected": 0.0, "step": 2017 }, { "epoch": 11.273743016759777, "grad_norm": 2.982781794529716, "learning_rate": 2.431692281831811e-07, "logits/chosen": -3.328136682510376, "logits/rejected": -3.329343795776367, "logps/chosen": -3.3141212463378906, "logps/rejected": -64.44425201416016, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 2.9037461280822754, "rewards/margins": 2.9037461280822754, "rewards/rejected": 0.0, "step": 2018 }, { "epoch": 11.279329608938548, "grad_norm": 0.880899449492303, "learning_rate": 2.426464342568213e-07, "logits/chosen": -3.4399499893188477, "logits/rejected": -3.456308603286743, "logps/chosen": -0.7736425995826721, "logps/rejected": -87.82368469238281, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 2.341750383377075, "rewards/margins": 2.341750383377075, "rewards/rejected": 0.0, "step": 2019 }, { "epoch": 11.28491620111732, "grad_norm": 0.4050577518047395, "learning_rate": 2.421240228036923e-07, "logits/chosen": -3.4010915756225586, "logits/rejected": -3.3362855911254883, "logps/chosen": -7.492812633514404, "logps/rejected": -59.437522888183594, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": 3.1338376998901367, "rewards/margins": 3.1338376998901367, "rewards/rejected": 0.0, "step": 2020 }, { "epoch": 11.28491620111732, "eval_logits/chosen": -3.2706871032714844, "eval_logits/rejected": -3.3993258476257324, "eval_logps/chosen": -29.3426456451416, "eval_logps/rejected": -53.73309326171875, "eval_loss": 0.9600690007209778, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": 0.07273955643177032, "eval_rewards/margins": 0.07273955643177032, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6658, "eval_samples_per_second": 9.49, "eval_steps_per_second": 0.306, "step": 2020 }, { "epoch": 11.29050279329609, "grad_norm": 0.4855923805945946, "learning_rate": 2.4160199460019063e-07, "logits/chosen": -3.0380606651306152, "logits/rejected": -3.221691131591797, "logps/chosen": -0.1503109633922577, "logps/rejected": -49.60858917236328, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 1.4069033861160278, "rewards/margins": 1.4069033861160278, "rewards/rejected": 0.0, "step": 2021 }, { "epoch": 11.296089385474861, "grad_norm": 0.4525412670100124, "learning_rate": 2.4108035042214315e-07, "logits/chosen": -3.411052703857422, "logits/rejected": -3.3881309032440186, "logps/chosen": -7.243567943572998, "logps/rejected": -83.95309448242188, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.962278127670288, "rewards/margins": 2.962278127670288, "rewards/rejected": 0.0, "step": 2022 }, { "epoch": 11.30167597765363, "grad_norm": 0.4495443200088319, "learning_rate": 2.40559091044806e-07, "logits/chosen": -3.3428659439086914, "logits/rejected": -3.438525915145874, "logps/chosen": -2.012706995010376, "logps/rejected": -36.666404724121094, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 2.595395565032959, "rewards/margins": 2.595395565032959, "rewards/rejected": 0.0, "step": 2023 }, { "epoch": 11.307262569832401, "grad_norm": 0.4579302566909401, "learning_rate": 2.4003821724286347e-07, "logits/chosen": -3.5048301219940186, "logits/rejected": -3.567451000213623, "logps/chosen": -4.573941230773926, "logps/rejected": -49.41082000732422, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 2.8217461109161377, "rewards/margins": 2.8217461109161377, "rewards/rejected": 0.0, "step": 2024 }, { "epoch": 11.312849162011172, "grad_norm": 0.9202875260967602, "learning_rate": 2.395177297904268e-07, "logits/chosen": -3.6099133491516113, "logits/rejected": -3.168043851852417, "logps/chosen": -0.5238317251205444, "logps/rejected": -100.91718292236328, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 2.2232165336608887, "rewards/margins": 2.2232165336608887, "rewards/rejected": 0.0, "step": 2025 }, { "epoch": 11.318435754189943, "grad_norm": 0.4451221198149984, "learning_rate": 2.38997629461033e-07, "logits/chosen": -3.102287769317627, "logits/rejected": -3.227853775024414, "logps/chosen": -20.150047302246094, "logps/rejected": -40.576385498046875, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": 3.901444911956787, "rewards/margins": 3.901444911956787, "rewards/rejected": 0.0, "step": 2026 }, { "epoch": 11.324022346368714, "grad_norm": 0.4696703530585902, "learning_rate": 2.3847791702764357e-07, "logits/chosen": -3.080986261367798, "logits/rejected": -3.215866804122925, "logps/chosen": -0.3314264714717865, "logps/rejected": -102.53214263916016, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 1.6432881355285645, "rewards/margins": 1.6432881355285645, "rewards/rejected": 0.0, "step": 2027 }, { "epoch": 11.329608938547485, "grad_norm": 0.5094241705056239, "learning_rate": 2.379585932626444e-07, "logits/chosen": -3.2015206813812256, "logits/rejected": -3.4322457313537598, "logps/chosen": -25.20183753967285, "logps/rejected": -53.60784912109375, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 3.472428321838379, "rewards/margins": 3.472428321838379, "rewards/rejected": 0.0, "step": 2028 }, { "epoch": 11.335195530726256, "grad_norm": 0.4306664131712824, "learning_rate": 2.3743965893784223e-07, "logits/chosen": -3.070831775665283, "logits/rejected": -3.1770286560058594, "logps/chosen": -29.29344940185547, "logps/rejected": -38.18990707397461, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 3.9771416187286377, "rewards/margins": 3.9771416187286377, "rewards/rejected": 0.0, "step": 2029 }, { "epoch": 11.340782122905027, "grad_norm": 0.714306424060186, "learning_rate": 2.3692111482446658e-07, "logits/chosen": -3.3322372436523438, "logits/rejected": -3.544323444366455, "logps/chosen": -1.9236233234405518, "logps/rejected": -33.58332061767578, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.7325119972229004, "rewards/margins": 2.7325119972229004, "rewards/rejected": 0.0, "step": 2030 }, { "epoch": 11.346368715083798, "grad_norm": 0.773819892480267, "learning_rate": 2.3640296169316604e-07, "logits/chosen": -3.1517975330352783, "logits/rejected": -3.1840906143188477, "logps/chosen": -4.88399076461792, "logps/rejected": -32.34602355957031, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 2.597146511077881, "rewards/margins": 2.597146511077881, "rewards/rejected": 0.0, "step": 2031 }, { "epoch": 11.35195530726257, "grad_norm": 1.0386674687752542, "learning_rate": 2.358852003140085e-07, "logits/chosen": -3.435654401779175, "logits/rejected": -3.552208185195923, "logps/chosen": -35.12843322753906, "logps/rejected": -41.45343017578125, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 5.088462829589844, "rewards/margins": 5.088462829589844, "rewards/rejected": 0.0, "step": 2032 }, { "epoch": 11.35754189944134, "grad_norm": 0.6542811410942738, "learning_rate": 2.3536783145647936e-07, "logits/chosen": -3.254070520401001, "logits/rejected": -3.3398876190185547, "logps/chosen": -1.2409307956695557, "logps/rejected": -28.919286727905273, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": 2.3911044597625732, "rewards/margins": 2.3911044597625732, "rewards/rejected": 0.0, "step": 2033 }, { "epoch": 11.363128491620111, "grad_norm": 0.730378674888996, "learning_rate": 2.348508558894815e-07, "logits/chosen": -3.287074089050293, "logits/rejected": -3.207710027694702, "logps/chosen": -1.7696616649627686, "logps/rejected": -36.400367736816406, "loss": 0.0801, "rewards/accuracies": 1.0, "rewards/chosen": 2.2627506256103516, "rewards/margins": 2.2627506256103516, "rewards/rejected": 0.0, "step": 2034 }, { "epoch": 11.368715083798882, "grad_norm": 0.42622717376859515, "learning_rate": 2.3433427438133206e-07, "logits/chosen": -3.5788822174072266, "logits/rejected": -3.491795778274536, "logps/chosen": -1.0302319526672363, "logps/rejected": -40.77783203125, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 2.491347312927246, "rewards/margins": 2.491347312927246, "rewards/rejected": 0.0, "step": 2035 }, { "epoch": 11.374301675977653, "grad_norm": 0.49680662202885134, "learning_rate": 2.338180876997632e-07, "logits/chosen": -3.430384874343872, "logits/rejected": -3.5522563457489014, "logps/chosen": -2.5603208541870117, "logps/rejected": -56.43990707397461, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 3.1166903972625732, "rewards/margins": 3.1166903972625732, "rewards/rejected": 0.0, "step": 2036 }, { "epoch": 11.379888268156424, "grad_norm": 0.4629377401366435, "learning_rate": 2.3330229661192074e-07, "logits/chosen": -3.4896841049194336, "logits/rejected": -3.30959153175354, "logps/chosen": -1.999104619026184, "logps/rejected": -45.836875915527344, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 3.052429676055908, "rewards/margins": 3.052429676055908, "rewards/rejected": 0.0, "step": 2037 }, { "epoch": 11.385474860335195, "grad_norm": 0.4416742911024336, "learning_rate": 2.3278690188436144e-07, "logits/chosen": -3.3385913372039795, "logits/rejected": -3.5497634410858154, "logps/chosen": -0.18844816088676453, "logps/rejected": -62.22562789916992, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 1.5747473239898682, "rewards/margins": 1.5747473239898682, "rewards/rejected": 0.0, "step": 2038 }, { "epoch": 11.391061452513966, "grad_norm": 0.6254421054874332, "learning_rate": 2.3227190428305422e-07, "logits/chosen": -3.536216974258423, "logits/rejected": -3.433133363723755, "logps/chosen": -7.839025974273682, "logps/rejected": -46.35868453979492, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 3.7869088649749756, "rewards/margins": 3.7869088649749756, "rewards/rejected": 0.0, "step": 2039 }, { "epoch": 11.396648044692737, "grad_norm": 1.4167930649729465, "learning_rate": 2.3175730457337695e-07, "logits/chosen": -3.319622278213501, "logits/rejected": -2.999826669692993, "logps/chosen": -15.87757682800293, "logps/rejected": -42.64402770996094, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 2.7572197914123535, "rewards/margins": 2.7572197914123535, "rewards/rejected": 0.0, "step": 2040 }, { "epoch": 11.396648044692737, "eval_logits/chosen": -3.2879624366760254, "eval_logits/rejected": -3.4116337299346924, "eval_logps/chosen": -29.63619613647461, "eval_logps/rejected": -53.59400177001953, "eval_loss": 0.9723474383354187, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.043384671211242676, "eval_rewards/margins": 0.043384671211242676, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6912, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 2040 }, { "epoch": 11.402234636871508, "grad_norm": 0.4308853248830393, "learning_rate": 2.312431035201165e-07, "logits/chosen": -3.4698472023010254, "logits/rejected": -3.575444221496582, "logps/chosen": -23.711624145507812, "logps/rejected": -43.78266143798828, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 4.2616167068481445, "rewards/margins": 4.2616167068481445, "rewards/rejected": 0.0, "step": 2041 }, { "epoch": 11.40782122905028, "grad_norm": 1.3220967883114643, "learning_rate": 2.3072930188746697e-07, "logits/chosen": -3.6334421634674072, "logits/rejected": -3.668956995010376, "logps/chosen": -16.505786895751953, "logps/rejected": -42.4336051940918, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 3.0035178661346436, "rewards/margins": 3.0035178661346436, "rewards/rejected": 0.0, "step": 2042 }, { "epoch": 11.41340782122905, "grad_norm": 1.6326144170751449, "learning_rate": 2.3021590043902979e-07, "logits/chosen": -3.575676918029785, "logits/rejected": -3.6062214374542236, "logps/chosen": -2.799795150756836, "logps/rejected": -36.262168884277344, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 3.2209248542785645, "rewards/margins": 3.2209248542785645, "rewards/rejected": 0.0, "step": 2043 }, { "epoch": 11.418994413407821, "grad_norm": 0.8682698856020982, "learning_rate": 2.2970289993781022e-07, "logits/chosen": -3.4891762733459473, "logits/rejected": -3.435276746749878, "logps/chosen": -27.726642608642578, "logps/rejected": -50.93583679199219, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 2.8662686347961426, "rewards/margins": 2.8662686347961426, "rewards/rejected": 0.0, "step": 2044 }, { "epoch": 11.424581005586592, "grad_norm": 0.3732082545979339, "learning_rate": 2.2919030114621846e-07, "logits/chosen": -3.3657753467559814, "logits/rejected": -3.402918577194214, "logps/chosen": -6.243122577667236, "logps/rejected": -55.91102600097656, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 3.098987340927124, "rewards/margins": 3.098987340927124, "rewards/rejected": 0.0, "step": 2045 }, { "epoch": 11.430167597765363, "grad_norm": 0.7583505967837766, "learning_rate": 2.2867810482606815e-07, "logits/chosen": -3.4816181659698486, "logits/rejected": -3.5671916007995605, "logps/chosen": -7.070084571838379, "logps/rejected": -71.01090240478516, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 3.6898624897003174, "rewards/margins": 3.6898624897003174, "rewards/rejected": 0.0, "step": 2046 }, { "epoch": 11.435754189944134, "grad_norm": 0.45539848253727405, "learning_rate": 2.2816631173857347e-07, "logits/chosen": -3.114863634109497, "logits/rejected": -3.171290636062622, "logps/chosen": -1.5149728059768677, "logps/rejected": -43.40138244628906, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.839877128601074, "rewards/margins": 2.839877128601074, "rewards/rejected": 0.0, "step": 2047 }, { "epoch": 11.441340782122905, "grad_norm": 0.44648626009766645, "learning_rate": 2.2765492264435087e-07, "logits/chosen": -3.459627151489258, "logits/rejected": -3.62988543510437, "logps/chosen": -0.8486126065254211, "logps/rejected": -46.58003616333008, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 2.526358127593994, "rewards/margins": 2.526358127593994, "rewards/rejected": 0.0, "step": 2048 }, { "epoch": 11.446927374301676, "grad_norm": 0.8420872506929729, "learning_rate": 2.2714393830341482e-07, "logits/chosen": -3.106123924255371, "logits/rejected": -3.256624698638916, "logps/chosen": -2.9391133785247803, "logps/rejected": -64.49932098388672, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 2.3291449546813965, "rewards/margins": 2.3291449546813965, "rewards/rejected": 0.0, "step": 2049 }, { "epoch": 11.452513966480447, "grad_norm": 0.4427576769856584, "learning_rate": 2.266333594751797e-07, "logits/chosen": -3.3989005088806152, "logits/rejected": -2.927534341812134, "logps/chosen": -0.3201085925102234, "logps/rejected": -91.62421417236328, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 1.822981357574463, "rewards/margins": 1.822981357574463, "rewards/rejected": 0.0, "step": 2050 }, { "epoch": 11.458100558659218, "grad_norm": 1.9602573980316333, "learning_rate": 2.2612318691845628e-07, "logits/chosen": -3.3661935329437256, "logits/rejected": -3.5726842880249023, "logps/chosen": -0.6012658476829529, "logps/rejected": -58.52061462402344, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 1.9937279224395752, "rewards/margins": 1.9937279224395752, "rewards/rejected": 0.0, "step": 2051 }, { "epoch": 11.46368715083799, "grad_norm": 0.515007798693676, "learning_rate": 2.256134213914519e-07, "logits/chosen": -3.470977783203125, "logits/rejected": -3.5958571434020996, "logps/chosen": -6.127221584320068, "logps/rejected": -31.5274715423584, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.0713388919830322, "rewards/margins": 3.0713388919830322, "rewards/rejected": 0.0, "step": 2052 }, { "epoch": 11.46927374301676, "grad_norm": 0.4701438832254253, "learning_rate": 2.2510406365176876e-07, "logits/chosen": -3.633430004119873, "logits/rejected": -3.618941068649292, "logps/chosen": -9.452795028686523, "logps/rejected": -26.49870491027832, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 3.020148515701294, "rewards/margins": 3.020148515701294, "rewards/rejected": 0.0, "step": 2053 }, { "epoch": 11.474860335195531, "grad_norm": 0.48878517668358795, "learning_rate": 2.245951144564036e-07, "logits/chosen": -3.1848111152648926, "logits/rejected": -3.3400304317474365, "logps/chosen": -1.1764954328536987, "logps/rejected": -49.090797424316406, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 2.439659833908081, "rewards/margins": 2.439659833908081, "rewards/rejected": 0.0, "step": 2054 }, { "epoch": 11.480446927374302, "grad_norm": 0.5292091561947605, "learning_rate": 2.2408657456174519e-07, "logits/chosen": -3.138007640838623, "logits/rejected": -3.077355146408081, "logps/chosen": -19.65635108947754, "logps/rejected": -77.37872314453125, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.171947479248047, "rewards/margins": 3.171947479248047, "rewards/rejected": 0.0, "step": 2055 }, { "epoch": 11.486033519553073, "grad_norm": 0.7808373197064731, "learning_rate": 2.2357844472357428e-07, "logits/chosen": -3.347581148147583, "logits/rejected": -3.231194019317627, "logps/chosen": -0.8362330198287964, "logps/rejected": -72.31649780273438, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 2.3305838108062744, "rewards/margins": 2.3305838108062744, "rewards/rejected": 0.0, "step": 2056 }, { "epoch": 11.491620111731844, "grad_norm": 0.5000031262306723, "learning_rate": 2.2307072569706286e-07, "logits/chosen": -3.4568393230438232, "logits/rejected": -3.389314889907837, "logps/chosen": -1.3895785808563232, "logps/rejected": -46.76854705810547, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 2.2795677185058594, "rewards/margins": 2.2795677185058594, "rewards/rejected": 0.0, "step": 2057 }, { "epoch": 11.497206703910614, "grad_norm": 0.6276310130319155, "learning_rate": 2.2256341823677105e-07, "logits/chosen": -3.4142253398895264, "logits/rejected": -3.395514965057373, "logps/chosen": -0.7626144289970398, "logps/rejected": -52.26962661743164, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 2.44633150100708, "rewards/margins": 2.44633150100708, "rewards/rejected": 0.0, "step": 2058 }, { "epoch": 11.502793296089386, "grad_norm": 0.7585110112800841, "learning_rate": 2.2205652309664874e-07, "logits/chosen": -3.391388177871704, "logits/rejected": -3.6604607105255127, "logps/chosen": -3.459272623062134, "logps/rejected": -98.18557739257812, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.000126838684082, "rewards/margins": 2.000126838684082, "rewards/rejected": 0.0, "step": 2059 }, { "epoch": 11.508379888268156, "grad_norm": 0.5827440897026235, "learning_rate": 2.2155004103003204e-07, "logits/chosen": -3.507307529449463, "logits/rejected": -3.482379913330078, "logps/chosen": -1.2515177726745605, "logps/rejected": -47.592872619628906, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 1.8853281736373901, "rewards/margins": 1.8853281736373901, "rewards/rejected": 0.0, "step": 2060 }, { "epoch": 11.508379888268156, "eval_logits/chosen": -3.2820420265197754, "eval_logits/rejected": -3.407228469848633, "eval_logps/chosen": -29.63604164123535, "eval_logps/rejected": -53.693016052246094, "eval_loss": 0.9693609476089478, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.04340028017759323, "eval_rewards/margins": 0.04340028017759323, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7388, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 2060 }, { "epoch": 11.513966480446927, "grad_norm": 0.7229903736753329, "learning_rate": 2.210439727896437e-07, "logits/chosen": -3.416884183883667, "logits/rejected": -3.483350992202759, "logps/chosen": -0.4949205815792084, "logps/rejected": -26.10192108154297, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 2.139338970184326, "rewards/margins": 2.139338970184326, "rewards/rejected": 0.0, "step": 2061 }, { "epoch": 11.519553072625698, "grad_norm": 2.0827388719160456, "learning_rate": 2.205383191275909e-07, "logits/chosen": -3.3750455379486084, "logits/rejected": -3.5473861694335938, "logps/chosen": -9.99547290802002, "logps/rejected": -52.88782501220703, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.794489622116089, "rewards/margins": 2.794489622116089, "rewards/rejected": 0.0, "step": 2062 }, { "epoch": 11.525139664804469, "grad_norm": 0.4764702527149911, "learning_rate": 2.200330807953657e-07, "logits/chosen": -3.4228296279907227, "logits/rejected": -3.34614634513855, "logps/chosen": -1.506871223449707, "logps/rejected": -73.5494384765625, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 1.776228427886963, "rewards/margins": 1.776228427886963, "rewards/rejected": 0.0, "step": 2063 }, { "epoch": 11.53072625698324, "grad_norm": 0.6727045742513333, "learning_rate": 2.1952825854384138e-07, "logits/chosen": -3.231311082839966, "logits/rejected": -3.223719358444214, "logps/chosen": -4.259906768798828, "logps/rejected": -47.73058319091797, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 3.09010648727417, "rewards/margins": 3.09010648727417, "rewards/rejected": 0.0, "step": 2064 }, { "epoch": 11.53631284916201, "grad_norm": 0.5606730494853186, "learning_rate": 2.1902385312327438e-07, "logits/chosen": -3.5459465980529785, "logits/rejected": -3.4588568210601807, "logps/chosen": -4.553582668304443, "logps/rejected": -40.20221710205078, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 3.609736442565918, "rewards/margins": 3.609736442565918, "rewards/rejected": 0.0, "step": 2065 }, { "epoch": 11.541899441340782, "grad_norm": 0.42592803024726883, "learning_rate": 2.185198652833009e-07, "logits/chosen": -3.2113282680511475, "logits/rejected": -3.3077077865600586, "logps/chosen": -3.825059175491333, "logps/rejected": -82.86166381835938, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 2.2419495582580566, "rewards/margins": 2.2419495582580566, "rewards/rejected": 0.0, "step": 2066 }, { "epoch": 11.547486033519553, "grad_norm": 0.4329193833006763, "learning_rate": 2.1801629577293617e-07, "logits/chosen": -3.5197689533233643, "logits/rejected": -3.620059013366699, "logps/chosen": -1.2800936698913574, "logps/rejected": -47.56212615966797, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 2.994467258453369, "rewards/margins": 2.994467258453369, "rewards/rejected": 0.0, "step": 2067 }, { "epoch": 11.553072625698324, "grad_norm": 0.6671559109647464, "learning_rate": 2.1751314534057458e-07, "logits/chosen": -3.589369773864746, "logits/rejected": -3.6188716888427734, "logps/chosen": -1.5386724472045898, "logps/rejected": -43.215354919433594, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": 2.6964192390441895, "rewards/margins": 2.6964192390441895, "rewards/rejected": 0.0, "step": 2068 }, { "epoch": 11.558659217877095, "grad_norm": 1.173315727758856, "learning_rate": 2.1701041473398724e-07, "logits/chosen": -3.56231427192688, "logits/rejected": -3.6502504348754883, "logps/chosen": -2.079115629196167, "logps/rejected": -54.19228744506836, "loss": 0.086, "rewards/accuracies": 1.0, "rewards/chosen": 2.3071088790893555, "rewards/margins": 2.3071088790893555, "rewards/rejected": 0.0, "step": 2069 }, { "epoch": 11.564245810055866, "grad_norm": 0.40321424019889146, "learning_rate": 2.1650810470032126e-07, "logits/chosen": -3.4973809719085693, "logits/rejected": -3.4846181869506836, "logps/chosen": -0.876288652420044, "logps/rejected": -48.02561569213867, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 2.3294496536254883, "rewards/margins": 2.3294496536254883, "rewards/rejected": 0.0, "step": 2070 }, { "epoch": 11.569832402234637, "grad_norm": 0.7211952196032335, "learning_rate": 2.1600621598609863e-07, "logits/chosen": -3.3570892810821533, "logits/rejected": -3.3892922401428223, "logps/chosen": -0.2817106544971466, "logps/rejected": -75.76512908935547, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 1.5755364894866943, "rewards/margins": 1.5755364894866943, "rewards/rejected": 0.0, "step": 2071 }, { "epoch": 11.575418994413408, "grad_norm": 0.5148112052635592, "learning_rate": 2.1550474933721603e-07, "logits/chosen": -3.1936018466949463, "logits/rejected": -3.2042415142059326, "logps/chosen": -2.460733652114868, "logps/rejected": -67.72477722167969, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.5301566123962402, "rewards/margins": 2.5301566123962402, "rewards/rejected": 0.0, "step": 2072 }, { "epoch": 11.581005586592179, "grad_norm": 0.6167113232137355, "learning_rate": 2.1500370549894132e-07, "logits/chosen": -3.265223503112793, "logits/rejected": -3.202566146850586, "logps/chosen": -1.2127795219421387, "logps/rejected": -49.323612213134766, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 2.4704246520996094, "rewards/margins": 2.4704246520996094, "rewards/rejected": 0.0, "step": 2073 }, { "epoch": 11.58659217877095, "grad_norm": 0.6130793910733988, "learning_rate": 2.145030852159155e-07, "logits/chosen": -3.730278730392456, "logits/rejected": -3.687791109085083, "logps/chosen": -0.1538744866847992, "logps/rejected": -42.28436279296875, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": 1.4262964725494385, "rewards/margins": 1.4262964725494385, "rewards/rejected": 0.0, "step": 2074 }, { "epoch": 11.59217877094972, "grad_norm": 1.1180693351795155, "learning_rate": 2.1400288923214937e-07, "logits/chosen": -3.408236026763916, "logits/rejected": -3.3714475631713867, "logps/chosen": -1.658205509185791, "logps/rejected": -54.93598175048828, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 2.5506534576416016, "rewards/margins": 2.5506534576416016, "rewards/rejected": 0.0, "step": 2075 }, { "epoch": 11.597765363128492, "grad_norm": 0.5064447962549264, "learning_rate": 2.1350311829102285e-07, "logits/chosen": -3.3614866733551025, "logits/rejected": -3.36944580078125, "logps/chosen": -0.326558381319046, "logps/rejected": -53.19791030883789, "loss": 0.0934, "rewards/accuracies": 1.0, "rewards/chosen": 1.6810168027877808, "rewards/margins": 1.6810168027877808, "rewards/rejected": 0.0, "step": 2076 }, { "epoch": 11.603351955307263, "grad_norm": 0.5463711792928004, "learning_rate": 2.1300377313528523e-07, "logits/chosen": -3.37495493888855, "logits/rejected": -3.3020448684692383, "logps/chosen": -7.588983535766602, "logps/rejected": -59.491539001464844, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 2.7611913681030273, "rewards/margins": 2.7611913681030273, "rewards/rejected": 0.0, "step": 2077 }, { "epoch": 11.608938547486034, "grad_norm": 0.8164411536318138, "learning_rate": 2.1250485450705159e-07, "logits/chosen": -3.532632350921631, "logits/rejected": -3.720940589904785, "logps/chosen": -5.543283939361572, "logps/rejected": -85.24209594726562, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 3.0614562034606934, "rewards/margins": 3.0614562034606934, "rewards/rejected": 0.0, "step": 2078 }, { "epoch": 11.614525139664805, "grad_norm": 0.927976280513748, "learning_rate": 2.120063631478044e-07, "logits/chosen": -3.5740299224853516, "logits/rejected": -3.7173476219177246, "logps/chosen": -10.757414817810059, "logps/rejected": -35.99129867553711, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.0895583629608154, "rewards/margins": 3.0895583629608154, "rewards/rejected": 0.0, "step": 2079 }, { "epoch": 11.620111731843576, "grad_norm": 0.5243803748945338, "learning_rate": 2.115082997983904e-07, "logits/chosen": -3.521507740020752, "logits/rejected": -3.1975574493408203, "logps/chosen": -3.892530679702759, "logps/rejected": -26.3160400390625, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": 3.4351625442504883, "rewards/margins": 3.4351625442504883, "rewards/rejected": 0.0, "step": 2080 }, { "epoch": 11.620111731843576, "eval_logits/chosen": -3.2764930725097656, "eval_logits/rejected": -3.4013266563415527, "eval_logps/chosen": -29.51177978515625, "eval_logps/rejected": -54.050270080566406, "eval_loss": 0.9726657271385193, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.05582625791430473, "eval_rewards/margins": 0.05582625791430473, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7021, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 2080 }, { "epoch": 11.625698324022347, "grad_norm": 0.5444743947234505, "learning_rate": 2.110106651990205e-07, "logits/chosen": -3.456770658493042, "logits/rejected": -3.2560853958129883, "logps/chosen": -0.5817017555236816, "logps/rejected": -118.65254211425781, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 2.1886162757873535, "rewards/margins": 2.1886162757873535, "rewards/rejected": 0.0, "step": 2081 }, { "epoch": 11.631284916201118, "grad_norm": 1.1128860956078888, "learning_rate": 2.1051346008926812e-07, "logits/chosen": -3.5402708053588867, "logits/rejected": -3.5771753787994385, "logps/chosen": -7.171589374542236, "logps/rejected": -23.094192504882812, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 3.2549638748168945, "rewards/margins": 3.2549638748168945, "rewards/rejected": 0.0, "step": 2082 }, { "epoch": 11.636871508379889, "grad_norm": 0.5792323796491307, "learning_rate": 2.1001668520806913e-07, "logits/chosen": -3.3181416988372803, "logits/rejected": -3.6179141998291016, "logps/chosen": -10.795852661132812, "logps/rejected": -59.83184051513672, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 3.2504091262817383, "rewards/margins": 3.2504091262817383, "rewards/rejected": 0.0, "step": 2083 }, { "epoch": 11.64245810055866, "grad_norm": 0.4761403785744252, "learning_rate": 2.0952034129371882e-07, "logits/chosen": -3.581909418106079, "logits/rejected": -3.1815035343170166, "logps/chosen": -11.74966049194336, "logps/rejected": -73.74712371826172, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 3.506298542022705, "rewards/margins": 3.506298542022705, "rewards/rejected": 0.0, "step": 2084 }, { "epoch": 11.64804469273743, "grad_norm": 0.5056695575110497, "learning_rate": 2.0902442908387307e-07, "logits/chosen": -3.187776803970337, "logits/rejected": -3.6853528022766113, "logps/chosen": -1.2484012842178345, "logps/rejected": -62.97319793701172, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 2.032311201095581, "rewards/margins": 2.032311201095581, "rewards/rejected": 0.0, "step": 2085 }, { "epoch": 11.653631284916202, "grad_norm": 1.9086239252426467, "learning_rate": 2.0852894931554582e-07, "logits/chosen": -3.5712294578552246, "logits/rejected": -3.558100700378418, "logps/chosen": -7.533488750457764, "logps/rejected": -67.64964294433594, "loss": 0.1437, "rewards/accuracies": 1.0, "rewards/chosen": 3.256208896636963, "rewards/margins": 3.256208896636963, "rewards/rejected": 0.0, "step": 2086 }, { "epoch": 11.659217877094973, "grad_norm": 0.5605342456016813, "learning_rate": 2.0803390272510762e-07, "logits/chosen": -3.5309107303619385, "logits/rejected": -3.030372381210327, "logps/chosen": -10.332486152648926, "logps/rejected": -64.68547821044922, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 3.2190003395080566, "rewards/margins": 3.2190003395080566, "rewards/rejected": 0.0, "step": 2087 }, { "epoch": 11.664804469273744, "grad_norm": 0.530977087774688, "learning_rate": 2.0753929004828653e-07, "logits/chosen": -3.2939679622650146, "logits/rejected": -3.307115077972412, "logps/chosen": -1.5258204936981201, "logps/rejected": -64.69021606445312, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 2.4115960597991943, "rewards/margins": 2.4115960597991943, "rewards/rejected": 0.0, "step": 2088 }, { "epoch": 11.670391061452515, "grad_norm": 0.4456606009275189, "learning_rate": 2.0704511202016483e-07, "logits/chosen": -3.461855173110962, "logits/rejected": -3.3375463485717773, "logps/chosen": -0.309223473072052, "logps/rejected": -48.380401611328125, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 1.6402618885040283, "rewards/margins": 1.6402618885040283, "rewards/rejected": 0.0, "step": 2089 }, { "epoch": 11.675977653631286, "grad_norm": 0.41630393104425495, "learning_rate": 2.0655136937517903e-07, "logits/chosen": -3.4845030307769775, "logits/rejected": -3.4499826431274414, "logps/chosen": -0.4920295774936676, "logps/rejected": -35.75161361694336, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 2.52449893951416, "rewards/margins": 2.52449893951416, "rewards/rejected": 0.0, "step": 2090 }, { "epoch": 11.681564245810057, "grad_norm": 0.5409384675075213, "learning_rate": 2.0605806284711857e-07, "logits/chosen": -3.332338333129883, "logits/rejected": -3.3106095790863037, "logps/chosen": -1.0673799514770508, "logps/rejected": -61.278194427490234, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 2.2988715171813965, "rewards/margins": 2.2988715171813965, "rewards/rejected": 0.0, "step": 2091 }, { "epoch": 11.687150837988828, "grad_norm": 2.4057489678597936, "learning_rate": 2.0556519316912523e-07, "logits/chosen": -3.5256285667419434, "logits/rejected": -3.313889265060425, "logps/chosen": -1.9876480102539062, "logps/rejected": -53.89966583251953, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 3.1563377380371094, "rewards/margins": 3.1563377380371094, "rewards/rejected": 0.0, "step": 2092 }, { "epoch": 11.692737430167599, "grad_norm": 0.5666254690362961, "learning_rate": 2.0507276107369047e-07, "logits/chosen": -3.126408338546753, "logits/rejected": -3.331571102142334, "logps/chosen": -0.6741228103637695, "logps/rejected": -52.774986267089844, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 1.9165393114089966, "rewards/margins": 1.9165393114089966, "rewards/rejected": 0.0, "step": 2093 }, { "epoch": 11.69832402234637, "grad_norm": 1.180640827580221, "learning_rate": 2.045807672926566e-07, "logits/chosen": -3.6363162994384766, "logits/rejected": -3.535933494567871, "logps/chosen": -0.7559529542922974, "logps/rejected": -32.57082748413086, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 2.1516265869140625, "rewards/margins": 2.1516265869140625, "rewards/rejected": 0.0, "step": 2094 }, { "epoch": 11.703910614525139, "grad_norm": 1.0739088949985398, "learning_rate": 2.040892125572138e-07, "logits/chosen": -3.442941904067993, "logits/rejected": -3.2288362979888916, "logps/chosen": -0.630698561668396, "logps/rejected": -53.339263916015625, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.0331945419311523, "rewards/margins": 2.0331945419311523, "rewards/rejected": 0.0, "step": 2095 }, { "epoch": 11.70949720670391, "grad_norm": 1.968379364228944, "learning_rate": 2.0359809759789997e-07, "logits/chosen": -3.225104331970215, "logits/rejected": -3.3189284801483154, "logps/chosen": -1.058898687362671, "logps/rejected": -53.428955078125, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 2.358876943588257, "rewards/margins": 2.358876943588257, "rewards/rejected": 0.0, "step": 2096 }, { "epoch": 11.71508379888268, "grad_norm": 0.8065051146922257, "learning_rate": 2.0310742314459945e-07, "logits/chosen": -3.1914432048797607, "logits/rejected": -3.2663049697875977, "logps/chosen": -9.212248802185059, "logps/rejected": -52.66050338745117, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 3.001647472381592, "rewards/margins": 3.001647472381592, "rewards/rejected": 0.0, "step": 2097 }, { "epoch": 11.720670391061452, "grad_norm": 2.1276834324619203, "learning_rate": 2.0261718992654175e-07, "logits/chosen": -3.418238639831543, "logits/rejected": -3.5759994983673096, "logps/chosen": -1.724967360496521, "logps/rejected": -67.7520751953125, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 2.7611351013183594, "rewards/margins": 2.7611351013183594, "rewards/rejected": 0.0, "step": 2098 }, { "epoch": 11.726256983240223, "grad_norm": 1.9786927674124697, "learning_rate": 2.0212739867230083e-07, "logits/chosen": -3.475306510925293, "logits/rejected": -3.4787042140960693, "logps/chosen": -3.1913681030273438, "logps/rejected": -29.67127227783203, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 3.3035426139831543, "rewards/margins": 3.3035426139831543, "rewards/rejected": 0.0, "step": 2099 }, { "epoch": 11.731843575418994, "grad_norm": 1.1462877697832752, "learning_rate": 2.0163805010979351e-07, "logits/chosen": -3.322868824005127, "logits/rejected": -3.2770586013793945, "logps/chosen": -7.151961803436279, "logps/rejected": -94.27562713623047, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 2.769965171813965, "rewards/margins": 2.769965171813965, "rewards/rejected": 0.0, "step": 2100 }, { "epoch": 11.731843575418994, "eval_logits/chosen": -3.273817777633667, "eval_logits/rejected": -3.3991897106170654, "eval_logps/chosen": -29.351842880249023, "eval_logps/rejected": -53.964881896972656, "eval_loss": 0.9642612338066101, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.07181987911462784, "eval_rewards/margins": 0.07181987911462784, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7177, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 2100 }, { "epoch": 11.737430167597765, "grad_norm": 0.4820005838869611, "learning_rate": 2.0114914496627943e-07, "logits/chosen": -3.5352282524108887, "logits/rejected": -3.244537830352783, "logps/chosen": -0.8986507058143616, "logps/rejected": -57.75574493408203, "loss": 0.1192, "rewards/accuracies": 1.0, "rewards/chosen": 2.129504919052124, "rewards/margins": 2.129504919052124, "rewards/rejected": 0.0, "step": 2101 }, { "epoch": 11.743016759776536, "grad_norm": 0.5069767628399827, "learning_rate": 2.00660683968358e-07, "logits/chosen": -3.225170612335205, "logits/rejected": -3.566986322402954, "logps/chosen": -0.30834734439849854, "logps/rejected": -101.76130676269531, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 1.7267638444900513, "rewards/margins": 1.7267638444900513, "rewards/rejected": 0.0, "step": 2102 }, { "epoch": 11.748603351955307, "grad_norm": 0.5131499011156985, "learning_rate": 2.001726678419698e-07, "logits/chosen": -3.2986557483673096, "logits/rejected": -3.2515666484832764, "logps/chosen": -0.8364378213882446, "logps/rejected": -45.09752655029297, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 2.3112003803253174, "rewards/margins": 2.3112003803253174, "rewards/rejected": 0.0, "step": 2103 }, { "epoch": 11.754189944134078, "grad_norm": 1.3827191962840166, "learning_rate": 1.9968509731239352e-07, "logits/chosen": -3.241009473800659, "logits/rejected": -3.2275688648223877, "logps/chosen": -0.8142117261886597, "logps/rejected": -60.17033767700195, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.4198660850524902, "rewards/margins": 2.4198660850524902, "rewards/rejected": 0.0, "step": 2104 }, { "epoch": 11.759776536312849, "grad_norm": 4.5464145286381195, "learning_rate": 1.991979731042458e-07, "logits/chosen": -3.138615369796753, "logits/rejected": -3.4954192638397217, "logps/chosen": -4.946340560913086, "logps/rejected": -149.63125610351562, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 2.6405844688415527, "rewards/margins": 2.6405844688415527, "rewards/rejected": 0.0, "step": 2105 }, { "epoch": 11.76536312849162, "grad_norm": 1.816761052938629, "learning_rate": 1.987112959414801e-07, "logits/chosen": -3.4835660457611084, "logits/rejected": -3.4168593883514404, "logps/chosen": -2.545900821685791, "logps/rejected": -33.79948425292969, "loss": 0.1533, "rewards/accuracies": 1.0, "rewards/chosen": 2.324270009994507, "rewards/margins": 2.324270009994507, "rewards/rejected": 0.0, "step": 2106 }, { "epoch": 11.77094972067039, "grad_norm": 1.021166904232079, "learning_rate": 1.9822506654738487e-07, "logits/chosen": -3.2605655193328857, "logits/rejected": -3.407984733581543, "logps/chosen": -1.2512481212615967, "logps/rejected": -46.00942611694336, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 2.4974803924560547, "rewards/margins": 2.4974803924560547, "rewards/rejected": 0.0, "step": 2107 }, { "epoch": 11.776536312849162, "grad_norm": 2.827081417890233, "learning_rate": 1.9773928564458408e-07, "logits/chosen": -3.1404619216918945, "logits/rejected": -3.169956684112549, "logps/chosen": -9.449261665344238, "logps/rejected": -33.91954803466797, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": 3.326777458190918, "rewards/margins": 3.326777458190918, "rewards/rejected": 0.0, "step": 2108 }, { "epoch": 11.782122905027933, "grad_norm": 0.9347712030633651, "learning_rate": 1.972539539550346e-07, "logits/chosen": -3.3544299602508545, "logits/rejected": -3.3321897983551025, "logps/chosen": -4.494087219238281, "logps/rejected": -33.97837829589844, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.867629051208496, "rewards/margins": 2.867629051208496, "rewards/rejected": 0.0, "step": 2109 }, { "epoch": 11.787709497206704, "grad_norm": 0.44190054236605475, "learning_rate": 1.967690722000256e-07, "logits/chosen": -3.0685782432556152, "logits/rejected": -3.000790596008301, "logps/chosen": -3.3948845863342285, "logps/rejected": -47.722129821777344, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 2.6497321128845215, "rewards/margins": 2.6497321128845215, "rewards/rejected": 0.0, "step": 2110 }, { "epoch": 11.793296089385475, "grad_norm": 0.40784498624049237, "learning_rate": 1.9628464110017756e-07, "logits/chosen": -3.157773017883301, "logits/rejected": -3.2839903831481934, "logps/chosen": -6.024378776550293, "logps/rejected": -52.742496490478516, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 2.5025463104248047, "rewards/margins": 2.5025463104248047, "rewards/rejected": 0.0, "step": 2111 }, { "epoch": 11.798882681564246, "grad_norm": 0.9165257709534127, "learning_rate": 1.9580066137544204e-07, "logits/chosen": -3.252223014831543, "logits/rejected": -3.277956008911133, "logps/chosen": -0.549738883972168, "logps/rejected": -31.06273078918457, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 1.8361353874206543, "rewards/margins": 1.8361353874206543, "rewards/rejected": 0.0, "step": 2112 }, { "epoch": 11.804469273743017, "grad_norm": 0.6663716498748804, "learning_rate": 1.9531713374509823e-07, "logits/chosen": -3.544379472732544, "logits/rejected": -3.51424241065979, "logps/chosen": -1.7592977285385132, "logps/rejected": -36.52344512939453, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 2.8453869819641113, "rewards/margins": 2.8453869819641113, "rewards/rejected": 0.0, "step": 2113 }, { "epoch": 11.810055865921788, "grad_norm": 0.5070481300564695, "learning_rate": 1.9483405892775496e-07, "logits/chosen": -3.4501495361328125, "logits/rejected": -3.3233320713043213, "logps/chosen": -2.323268413543701, "logps/rejected": -49.021034240722656, "loss": 0.0926, "rewards/accuracies": 1.0, "rewards/chosen": 3.024831533432007, "rewards/margins": 3.024831533432007, "rewards/rejected": 0.0, "step": 2114 }, { "epoch": 11.815642458100559, "grad_norm": 0.5281047641423829, "learning_rate": 1.9435143764134716e-07, "logits/chosen": -3.6315793991088867, "logits/rejected": -3.5739078521728516, "logps/chosen": -1.9289957284927368, "logps/rejected": -43.96451950073242, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 2.6348986625671387, "rewards/margins": 2.6348986625671387, "rewards/rejected": 0.0, "step": 2115 }, { "epoch": 11.82122905027933, "grad_norm": 1.869397832797094, "learning_rate": 1.9386927060313606e-07, "logits/chosen": -3.134777307510376, "logits/rejected": -3.1398351192474365, "logps/chosen": -8.755775451660156, "logps/rejected": -29.74664878845215, "loss": 0.1416, "rewards/accuracies": 1.0, "rewards/chosen": 3.388301372528076, "rewards/margins": 3.388301372528076, "rewards/rejected": 0.0, "step": 2116 }, { "epoch": 11.8268156424581, "grad_norm": 0.43745260018046894, "learning_rate": 1.9338755852970777e-07, "logits/chosen": -3.228424549102783, "logits/rejected": -3.3705661296844482, "logps/chosen": -1.1017125844955444, "logps/rejected": -50.154632568359375, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.395521640777588, "rewards/margins": 2.395521640777588, "rewards/rejected": 0.0, "step": 2117 }, { "epoch": 11.832402234636872, "grad_norm": 0.465204292742605, "learning_rate": 1.9290630213697217e-07, "logits/chosen": -3.456580638885498, "logits/rejected": -3.3084499835968018, "logps/chosen": -4.237008094787598, "logps/rejected": -41.89889907836914, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 3.5508902072906494, "rewards/margins": 3.5508902072906494, "rewards/rejected": 0.0, "step": 2118 }, { "epoch": 11.837988826815643, "grad_norm": 1.9742680386399745, "learning_rate": 1.92425502140162e-07, "logits/chosen": -3.3315179347991943, "logits/rejected": -3.209519147872925, "logps/chosen": -4.020634651184082, "logps/rejected": -30.618637084960938, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": 2.715378999710083, "rewards/margins": 2.715378999710083, "rewards/rejected": 0.0, "step": 2119 }, { "epoch": 11.843575418994414, "grad_norm": 0.9255626394952459, "learning_rate": 1.9194515925383149e-07, "logits/chosen": -3.1456193923950195, "logits/rejected": -3.241938829421997, "logps/chosen": -10.063833236694336, "logps/rejected": -51.778404235839844, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.76076340675354, "rewards/margins": 2.76076340675354, "rewards/rejected": 0.0, "step": 2120 }, { "epoch": 11.843575418994414, "eval_logits/chosen": -3.272770643234253, "eval_logits/rejected": -3.3985888957977295, "eval_logps/chosen": -29.64059066772461, "eval_logps/rejected": -54.02800750732422, "eval_loss": 0.9608362913131714, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.042945243418216705, "eval_rewards/margins": 0.042945243418216705, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7156, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 2120 }, { "epoch": 11.849162011173185, "grad_norm": 1.3394621049516837, "learning_rate": 1.9146527419185626e-07, "logits/chosen": -3.4177379608154297, "logits/rejected": -3.447481393814087, "logps/chosen": -1.6461918354034424, "logps/rejected": -57.266231536865234, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.7926669120788574, "rewards/margins": 2.7926669120788574, "rewards/rejected": 0.0, "step": 2121 }, { "epoch": 11.854748603351956, "grad_norm": 0.4624204756202754, "learning_rate": 1.9098584766743027e-07, "logits/chosen": -3.424178123474121, "logits/rejected": -3.4454879760742188, "logps/chosen": -1.4741344451904297, "logps/rejected": -52.27875518798828, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": 2.6950955390930176, "rewards/margins": 2.6950955390930176, "rewards/rejected": 0.0, "step": 2122 }, { "epoch": 11.860335195530727, "grad_norm": 0.7365250401734182, "learning_rate": 1.9050688039306716e-07, "logits/chosen": -3.210947275161743, "logits/rejected": -3.3241193294525146, "logps/chosen": -2.2139992713928223, "logps/rejected": -64.00433349609375, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": 3.1638004779815674, "rewards/margins": 3.1638004779815674, "rewards/rejected": 0.0, "step": 2123 }, { "epoch": 11.865921787709498, "grad_norm": 0.7778971142705587, "learning_rate": 1.9002837308059756e-07, "logits/chosen": -2.925323009490967, "logits/rejected": -2.8885862827301025, "logps/chosen": -8.409188270568848, "logps/rejected": -55.37174606323242, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 2.9058103561401367, "rewards/margins": 2.9058103561401367, "rewards/rejected": 0.0, "step": 2124 }, { "epoch": 11.871508379888269, "grad_norm": 0.8504688428084934, "learning_rate": 1.8955032644116847e-07, "logits/chosen": -3.2839128971099854, "logits/rejected": -3.3986287117004395, "logps/chosen": -3.0384836196899414, "logps/rejected": -57.346824645996094, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.340529441833496, "rewards/margins": 2.340529441833496, "rewards/rejected": 0.0, "step": 2125 }, { "epoch": 11.87709497206704, "grad_norm": 4.1360967384429115, "learning_rate": 1.8907274118524235e-07, "logits/chosen": -3.41344952583313, "logits/rejected": -3.3344621658325195, "logps/chosen": -5.416000843048096, "logps/rejected": -33.39531707763672, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 3.1773829460144043, "rewards/margins": 3.1773829460144043, "rewards/rejected": 0.0, "step": 2126 }, { "epoch": 11.88268156424581, "grad_norm": 0.49328609463654216, "learning_rate": 1.8859561802259595e-07, "logits/chosen": -3.2809367179870605, "logits/rejected": -3.4360008239746094, "logps/chosen": -0.2677420973777771, "logps/rejected": -45.579261779785156, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 1.699220895767212, "rewards/margins": 1.699220895767212, "rewards/rejected": 0.0, "step": 2127 }, { "epoch": 11.888268156424582, "grad_norm": 0.9312043475282246, "learning_rate": 1.8811895766231928e-07, "logits/chosen": -3.3856492042541504, "logits/rejected": -3.301811456680298, "logps/chosen": -0.7853468656539917, "logps/rejected": -97.83235168457031, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 2.086432456970215, "rewards/margins": 2.086432456970215, "rewards/rejected": 0.0, "step": 2128 }, { "epoch": 11.893854748603353, "grad_norm": 0.6598547775012298, "learning_rate": 1.8764276081281428e-07, "logits/chosen": -3.3419158458709717, "logits/rejected": -3.520700693130493, "logps/chosen": -3.2810051441192627, "logps/rejected": -53.71620559692383, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 3.639993667602539, "rewards/margins": 3.639993667602539, "rewards/rejected": 0.0, "step": 2129 }, { "epoch": 11.899441340782122, "grad_norm": 0.4169267337223477, "learning_rate": 1.8716702818179486e-07, "logits/chosen": -3.5588531494140625, "logits/rejected": -3.3309450149536133, "logps/chosen": -1.766896367073059, "logps/rejected": -52.8946533203125, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.736774444580078, "rewards/margins": 2.736774444580078, "rewards/rejected": 0.0, "step": 2130 }, { "epoch": 11.905027932960895, "grad_norm": 0.863608474306918, "learning_rate": 1.866917604762837e-07, "logits/chosen": -3.491826057434082, "logits/rejected": -3.6879568099975586, "logps/chosen": -26.05544662475586, "logps/rejected": -52.53486633300781, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 3.1738553047180176, "rewards/margins": 3.1738553047180176, "rewards/rejected": 0.0, "step": 2131 }, { "epoch": 11.910614525139664, "grad_norm": 0.45433539472533707, "learning_rate": 1.8621695840261386e-07, "logits/chosen": -3.1819138526916504, "logits/rejected": -3.31738018989563, "logps/chosen": -0.6118383407592773, "logps/rejected": -62.76938247680664, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": 2.1953125, "rewards/margins": 2.1953125, "rewards/rejected": 0.0, "step": 2132 }, { "epoch": 11.916201117318435, "grad_norm": 0.46966855925399337, "learning_rate": 1.857426226664255e-07, "logits/chosen": -3.4747657775878906, "logits/rejected": -3.4594054222106934, "logps/chosen": -3.7471964359283447, "logps/rejected": -34.69524002075195, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 2.984175205230713, "rewards/margins": 2.984175205230713, "rewards/rejected": 0.0, "step": 2133 }, { "epoch": 11.921787709497206, "grad_norm": 1.0275136441730288, "learning_rate": 1.85268753972666e-07, "logits/chosen": -3.4865078926086426, "logits/rejected": -3.601943016052246, "logps/chosen": -9.224930763244629, "logps/rejected": -97.36491394042969, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 2.7399888038635254, "rewards/margins": 2.7399888038635254, "rewards/rejected": 0.0, "step": 2134 }, { "epoch": 11.927374301675977, "grad_norm": 0.5682073761034893, "learning_rate": 1.847953530255887e-07, "logits/chosen": -3.10056734085083, "logits/rejected": -3.3187789916992188, "logps/chosen": -28.744565963745117, "logps/rejected": -72.4722900390625, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 3.4625368118286133, "rewards/margins": 3.4625368118286133, "rewards/rejected": 0.0, "step": 2135 }, { "epoch": 11.932960893854748, "grad_norm": 1.6231014668350887, "learning_rate": 1.8432242052875162e-07, "logits/chosen": -3.2790393829345703, "logits/rejected": -3.3346593379974365, "logps/chosen": -4.857345104217529, "logps/rejected": -88.18710327148438, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 3.2367210388183594, "rewards/margins": 3.2367210388183594, "rewards/rejected": 0.0, "step": 2136 }, { "epoch": 11.938547486033519, "grad_norm": 2.255345980568635, "learning_rate": 1.8384995718501682e-07, "logits/chosen": -3.2585246562957764, "logits/rejected": -3.4492573738098145, "logps/chosen": -0.42797067761421204, "logps/rejected": -95.33000183105469, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": 1.56181800365448, "rewards/margins": 1.56181800365448, "rewards/rejected": 0.0, "step": 2137 }, { "epoch": 11.94413407821229, "grad_norm": 1.023505762488989, "learning_rate": 1.8337796369654885e-07, "logits/chosen": -3.2255258560180664, "logits/rejected": -3.2198212146759033, "logps/chosen": -1.1346759796142578, "logps/rejected": -33.88743591308594, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.6318230628967285, "rewards/margins": 2.6318230628967285, "rewards/rejected": 0.0, "step": 2138 }, { "epoch": 11.949720670391061, "grad_norm": 1.167077931668197, "learning_rate": 1.829064407648141e-07, "logits/chosen": -3.289717197418213, "logits/rejected": -3.258373737335205, "logps/chosen": -0.7531739473342896, "logps/rejected": -72.3611068725586, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.3357958793640137, "rewards/margins": 2.3357958793640137, "rewards/rejected": 0.0, "step": 2139 }, { "epoch": 11.955307262569832, "grad_norm": 1.1151009280089255, "learning_rate": 1.8243538909057955e-07, "logits/chosen": -3.5916812419891357, "logits/rejected": -3.5592756271362305, "logps/chosen": -1.4563828706741333, "logps/rejected": -48.035823822021484, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 2.417919158935547, "rewards/margins": 2.417919158935547, "rewards/rejected": 0.0, "step": 2140 }, { "epoch": 11.955307262569832, "eval_logits/chosen": -3.276703357696533, "eval_logits/rejected": -3.4007294178009033, "eval_logps/chosen": -29.58773422241211, "eval_logps/rejected": -54.25297164916992, "eval_loss": 0.9734715819358826, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.04823073744773865, "eval_rewards/margins": 0.04823073744773865, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7264, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 2140 }, { "epoch": 11.960893854748603, "grad_norm": 1.9002789397551343, "learning_rate": 1.8196480937391224e-07, "logits/chosen": -3.371736526489258, "logits/rejected": -3.214059829711914, "logps/chosen": -10.563570022583008, "logps/rejected": -41.80162048339844, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 3.8670542240142822, "rewards/margins": 3.8670542240142822, "rewards/rejected": 0.0, "step": 2141 }, { "epoch": 11.966480446927374, "grad_norm": 0.5989836192601836, "learning_rate": 1.8149470231417686e-07, "logits/chosen": -3.370497703552246, "logits/rejected": -3.3959856033325195, "logps/chosen": -0.544033408164978, "logps/rejected": -60.811676025390625, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 2.1266367435455322, "rewards/margins": 2.1266367435455322, "rewards/rejected": 0.0, "step": 2142 }, { "epoch": 11.972067039106145, "grad_norm": 1.4740528255270782, "learning_rate": 1.8102506861003668e-07, "logits/chosen": -3.4351704120635986, "logits/rejected": -3.348499059677124, "logps/chosen": -3.8457369804382324, "logps/rejected": -48.02412414550781, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.6251397132873535, "rewards/margins": 2.6251397132873535, "rewards/rejected": 0.0, "step": 2143 }, { "epoch": 11.977653631284916, "grad_norm": 0.584902785218383, "learning_rate": 1.805559089594509e-07, "logits/chosen": -3.5643978118896484, "logits/rejected": -3.5030927658081055, "logps/chosen": -0.784415602684021, "logps/rejected": -40.671119689941406, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 2.2984249591827393, "rewards/margins": 2.2984249591827393, "rewards/rejected": 0.0, "step": 2144 }, { "epoch": 11.983240223463687, "grad_norm": 0.878489920214445, "learning_rate": 1.800872240596743e-07, "logits/chosen": -3.441546678543091, "logits/rejected": -3.4520084857940674, "logps/chosen": -3.2121951580047607, "logps/rejected": -45.37566375732422, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 2.6495347023010254, "rewards/margins": 2.6495347023010254, "rewards/rejected": 0.0, "step": 2145 }, { "epoch": 11.988826815642458, "grad_norm": 0.7719241937540747, "learning_rate": 1.7961901460725604e-07, "logits/chosen": -3.2772867679595947, "logits/rejected": -3.240518569946289, "logps/chosen": -16.807783126831055, "logps/rejected": -37.623146057128906, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 2.625868320465088, "rewards/margins": 2.625868320465088, "rewards/rejected": 0.0, "step": 2146 }, { "epoch": 11.994413407821229, "grad_norm": 0.7699004627039944, "learning_rate": 1.7915128129803887e-07, "logits/chosen": -3.148439645767212, "logits/rejected": -3.2155673503875732, "logps/chosen": -17.334806442260742, "logps/rejected": -39.368770599365234, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 2.160996437072754, "rewards/margins": 2.160996437072754, "rewards/rejected": 0.0, "step": 2147 }, { "epoch": 12.0, "grad_norm": 0.37342330187378703, "learning_rate": 1.7868402482715766e-07, "logits/chosen": -3.322864294052124, "logits/rejected": -3.254760503768921, "logps/chosen": -0.28817659616470337, "logps/rejected": -69.99283599853516, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 1.7711372375488281, "rewards/margins": 1.7711372375488281, "rewards/rejected": 0.0, "step": 2148 }, { "epoch": 12.005586592178771, "grad_norm": 0.40378364708525616, "learning_rate": 1.782172458890387e-07, "logits/chosen": -3.5557851791381836, "logits/rejected": -3.6075425148010254, "logps/chosen": -1.883641242980957, "logps/rejected": -52.34925079345703, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 2.914915084838867, "rewards/margins": 2.914915084838867, "rewards/rejected": 0.0, "step": 2149 }, { "epoch": 12.011173184357542, "grad_norm": 0.5213657222741441, "learning_rate": 1.7775094517739903e-07, "logits/chosen": -3.403524160385132, "logits/rejected": -3.5024876594543457, "logps/chosen": -2.2176942825317383, "logps/rejected": -42.349082946777344, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 2.812084674835205, "rewards/margins": 2.812084674835205, "rewards/rejected": 0.0, "step": 2150 }, { "epoch": 12.016759776536313, "grad_norm": 0.6444221789051341, "learning_rate": 1.7728512338524388e-07, "logits/chosen": -3.3550872802734375, "logits/rejected": -3.214698076248169, "logps/chosen": -5.084292411804199, "logps/rejected": -62.02729797363281, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 3.57625150680542, "rewards/margins": 3.57625150680542, "rewards/rejected": 0.0, "step": 2151 }, { "epoch": 12.022346368715084, "grad_norm": 0.670653075788408, "learning_rate": 1.7681978120486796e-07, "logits/chosen": -3.240642547607422, "logits/rejected": -3.228584051132202, "logps/chosen": -12.051186561584473, "logps/rejected": -27.182872772216797, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 3.1799678802490234, "rewards/margins": 3.1799678802490234, "rewards/rejected": 0.0, "step": 2152 }, { "epoch": 12.027932960893855, "grad_norm": 0.4131077591617559, "learning_rate": 1.763549193278524e-07, "logits/chosen": -3.4236247539520264, "logits/rejected": -3.319427490234375, "logps/chosen": -0.9676990509033203, "logps/rejected": -47.9969596862793, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 2.5235748291015625, "rewards/margins": 2.5235748291015625, "rewards/rejected": 0.0, "step": 2153 }, { "epoch": 12.033519553072626, "grad_norm": 0.48318206462326607, "learning_rate": 1.7589053844506474e-07, "logits/chosen": -3.0199475288391113, "logits/rejected": -3.0718603134155273, "logps/chosen": -16.391725540161133, "logps/rejected": -41.51329803466797, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 3.944697380065918, "rewards/margins": 3.944697380065918, "rewards/rejected": 0.0, "step": 2154 }, { "epoch": 12.039106145251397, "grad_norm": 0.4376015475095301, "learning_rate": 1.7542663924665773e-07, "logits/chosen": -3.278794765472412, "logits/rejected": -3.369462490081787, "logps/chosen": -29.321495056152344, "logps/rejected": -74.3553695678711, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 4.3270392417907715, "rewards/margins": 4.3270392417907715, "rewards/rejected": 0.0, "step": 2155 }, { "epoch": 12.044692737430168, "grad_norm": 0.5227145883328304, "learning_rate": 1.7496322242206811e-07, "logits/chosen": -3.485487222671509, "logits/rejected": -3.5119569301605225, "logps/chosen": -0.7452935576438904, "logps/rejected": -54.82063293457031, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.328976631164551, "rewards/margins": 2.328976631164551, "rewards/rejected": 0.0, "step": 2156 }, { "epoch": 12.050279329608939, "grad_norm": 0.3774624657314431, "learning_rate": 1.7450028866001587e-07, "logits/chosen": -3.4001994132995605, "logits/rejected": -3.350005865097046, "logps/chosen": -9.71610164642334, "logps/rejected": -54.90071105957031, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": 2.7138869762420654, "rewards/margins": 2.7138869762420654, "rewards/rejected": 0.0, "step": 2157 }, { "epoch": 12.05586592178771, "grad_norm": 0.36388899604965774, "learning_rate": 1.7403783864850286e-07, "logits/chosen": -3.3635263442993164, "logits/rejected": -3.4219889640808105, "logps/chosen": -1.1147568225860596, "logps/rejected": -96.10930633544922, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 2.4907445907592773, "rewards/margins": 2.4907445907592773, "rewards/rejected": 0.0, "step": 2158 }, { "epoch": 12.061452513966481, "grad_norm": 0.4618707459634858, "learning_rate": 1.7357587307481263e-07, "logits/chosen": -3.41139554977417, "logits/rejected": -3.4588818550109863, "logps/chosen": -0.8434318900108337, "logps/rejected": -45.489280700683594, "loss": 0.0951, "rewards/accuracies": 1.0, "rewards/chosen": 2.515263795852661, "rewards/margins": 2.515263795852661, "rewards/rejected": 0.0, "step": 2159 }, { "epoch": 12.067039106145252, "grad_norm": 0.38213439243139324, "learning_rate": 1.7311439262550764e-07, "logits/chosen": -3.2825281620025635, "logits/rejected": -3.3209400177001953, "logps/chosen": -1.6238597631454468, "logps/rejected": -44.853328704833984, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": 3.048689842224121, "rewards/margins": 3.048689842224121, "rewards/rejected": 0.0, "step": 2160 }, { "epoch": 12.067039106145252, "eval_logits/chosen": -3.26629638671875, "eval_logits/rejected": -3.391310453414917, "eval_logps/chosen": -29.958614349365234, "eval_logps/rejected": -54.96844482421875, "eval_loss": 0.9988755583763123, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 0.011142569594085217, "eval_rewards/margins": 0.011142569594085217, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6987, "eval_samples_per_second": 9.481, "eval_steps_per_second": 0.306, "step": 2160 }, { "epoch": 12.072625698324023, "grad_norm": 0.3799615625511484, "learning_rate": 1.7265339798643047e-07, "logits/chosen": -3.2318854331970215, "logits/rejected": -3.3783950805664062, "logps/chosen": -4.772999286651611, "logps/rejected": -53.933372497558594, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 2.409533739089966, "rewards/margins": 2.409533739089966, "rewards/rejected": 0.0, "step": 2161 }, { "epoch": 12.078212290502794, "grad_norm": 0.4618446017978479, "learning_rate": 1.721928898427012e-07, "logits/chosen": -3.3426854610443115, "logits/rejected": -3.2753188610076904, "logps/chosen": -9.115558624267578, "logps/rejected": -36.56756591796875, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 3.01203989982605, "rewards/margins": 3.01203989982605, "rewards/rejected": 0.0, "step": 2162 }, { "epoch": 12.083798882681565, "grad_norm": 0.5644777010435362, "learning_rate": 1.7173286887871684e-07, "logits/chosen": -3.2985708713531494, "logits/rejected": -3.3940656185150146, "logps/chosen": -0.3955370783805847, "logps/rejected": -80.76856231689453, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 1.7360330820083618, "rewards/margins": 1.7360330820083618, "rewards/rejected": 0.0, "step": 2163 }, { "epoch": 12.089385474860336, "grad_norm": 0.9941797723077999, "learning_rate": 1.712733357781505e-07, "logits/chosen": -2.990569829940796, "logits/rejected": -2.997157573699951, "logps/chosen": -12.115921020507812, "logps/rejected": -45.2780876159668, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 3.1058850288391113, "rewards/margins": 3.1058850288391113, "rewards/rejected": 0.0, "step": 2164 }, { "epoch": 12.094972067039107, "grad_norm": 0.37519598750511796, "learning_rate": 1.708142912239502e-07, "logits/chosen": -3.445521593093872, "logits/rejected": -3.2191734313964844, "logps/chosen": -7.310822486877441, "logps/rejected": -33.084388732910156, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 3.684077501296997, "rewards/margins": 3.684077501296997, "rewards/rejected": 0.0, "step": 2165 }, { "epoch": 12.100558659217878, "grad_norm": 0.428464822203489, "learning_rate": 1.7035573589833796e-07, "logits/chosen": -3.2800114154815674, "logits/rejected": -3.3751020431518555, "logps/chosen": -1.4960510730743408, "logps/rejected": -66.55987548828125, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 2.6144371032714844, "rewards/margins": 2.6144371032714844, "rewards/rejected": 0.0, "step": 2166 }, { "epoch": 12.106145251396647, "grad_norm": 0.4087638051616013, "learning_rate": 1.6989767048280857e-07, "logits/chosen": -3.595515727996826, "logits/rejected": -3.7190680503845215, "logps/chosen": -5.6592116355896, "logps/rejected": -55.2605094909668, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": 3.3244736194610596, "rewards/margins": 3.3244736194610596, "rewards/rejected": 0.0, "step": 2167 }, { "epoch": 12.111731843575418, "grad_norm": 0.35076977748399546, "learning_rate": 1.6944009565812884e-07, "logits/chosen": -3.5053322315216064, "logits/rejected": -3.5027060508728027, "logps/chosen": -10.765336036682129, "logps/rejected": -28.595090866088867, "loss": 0.0925, "rewards/accuracies": 1.0, "rewards/chosen": 3.610229730606079, "rewards/margins": 3.610229730606079, "rewards/rejected": 0.0, "step": 2168 }, { "epoch": 12.11731843575419, "grad_norm": 0.36300012325593645, "learning_rate": 1.689830121043363e-07, "logits/chosen": -3.317830801010132, "logits/rejected": -3.3645992279052734, "logps/chosen": -4.212851524353027, "logps/rejected": -26.253808975219727, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 3.080103874206543, "rewards/margins": 3.080103874206543, "rewards/rejected": 0.0, "step": 2169 }, { "epoch": 12.12290502793296, "grad_norm": 0.8000529602967763, "learning_rate": 1.68526420500739e-07, "logits/chosen": -3.5707921981811523, "logits/rejected": -3.5279901027679443, "logps/chosen": -5.280650615692139, "logps/rejected": -57.73506164550781, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.6453027725219727, "rewards/margins": 2.6453027725219727, "rewards/rejected": 0.0, "step": 2170 }, { "epoch": 12.128491620111731, "grad_norm": 0.41276710037417147, "learning_rate": 1.6807032152591277e-07, "logits/chosen": -3.3153977394104004, "logits/rejected": -3.4051661491394043, "logps/chosen": -2.830223560333252, "logps/rejected": -44.051605224609375, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 3.325134038925171, "rewards/margins": 3.325134038925171, "rewards/rejected": 0.0, "step": 2171 }, { "epoch": 12.134078212290502, "grad_norm": 0.6782927881683671, "learning_rate": 1.6761471585770231e-07, "logits/chosen": -3.4325907230377197, "logits/rejected": -3.4208483695983887, "logps/chosen": -7.119795799255371, "logps/rejected": -50.88927459716797, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 3.3752222061157227, "rewards/margins": 3.3752222061157227, "rewards/rejected": 0.0, "step": 2172 }, { "epoch": 12.139664804469273, "grad_norm": 0.440721933774724, "learning_rate": 1.6715960417321868e-07, "logits/chosen": -3.5034115314483643, "logits/rejected": -3.3925890922546387, "logps/chosen": -1.6475284099578857, "logps/rejected": -30.561010360717773, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 3.1842782497406006, "rewards/margins": 3.1842782497406006, "rewards/rejected": 0.0, "step": 2173 }, { "epoch": 12.145251396648044, "grad_norm": 0.4353003495233225, "learning_rate": 1.667049871488389e-07, "logits/chosen": -3.606475830078125, "logits/rejected": -3.7171545028686523, "logps/chosen": -1.5183839797973633, "logps/rejected": -64.20674133300781, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 2.561896562576294, "rewards/margins": 2.561896562576294, "rewards/rejected": 0.0, "step": 2174 }, { "epoch": 12.150837988826815, "grad_norm": 0.5052694588263025, "learning_rate": 1.662508654602048e-07, "logits/chosen": -3.3743844032287598, "logits/rejected": -3.4152326583862305, "logps/chosen": -10.79573917388916, "logps/rejected": -37.08635330200195, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 3.2049059867858887, "rewards/margins": 3.2049059867858887, "rewards/rejected": 0.0, "step": 2175 }, { "epoch": 12.156424581005586, "grad_norm": 0.3844838734093418, "learning_rate": 1.657972397822221e-07, "logits/chosen": -3.402432680130005, "logits/rejected": -3.5067174434661865, "logps/chosen": -0.4482359290122986, "logps/rejected": -54.469364166259766, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 1.9763888120651245, "rewards/margins": 1.9763888120651245, "rewards/rejected": 0.0, "step": 2176 }, { "epoch": 12.162011173184357, "grad_norm": 0.42614555087996847, "learning_rate": 1.653441107890593e-07, "logits/chosen": -3.187455654144287, "logits/rejected": -3.3040640354156494, "logps/chosen": -43.201087951660156, "logps/rejected": -39.88694763183594, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 3.3057801723480225, "rewards/margins": 3.3057801723480225, "rewards/rejected": 0.0, "step": 2177 }, { "epoch": 12.167597765363128, "grad_norm": 0.4477183244334658, "learning_rate": 1.648914791541467e-07, "logits/chosen": -3.285576581954956, "logits/rejected": -3.524308919906616, "logps/chosen": -1.0072684288024902, "logps/rejected": -71.30606079101562, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 2.2392828464508057, "rewards/margins": 2.2392828464508057, "rewards/rejected": 0.0, "step": 2178 }, { "epoch": 12.1731843575419, "grad_norm": 0.3276722931826065, "learning_rate": 1.6443934555017592e-07, "logits/chosen": -3.368210792541504, "logits/rejected": -3.342644214630127, "logps/chosen": -0.6389519572257996, "logps/rejected": -31.776371002197266, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 2.2634239196777344, "rewards/margins": 2.2634239196777344, "rewards/rejected": 0.0, "step": 2179 }, { "epoch": 12.17877094972067, "grad_norm": 1.4139689292348572, "learning_rate": 1.6398771064909745e-07, "logits/chosen": -3.434065103530884, "logits/rejected": -3.4983153343200684, "logps/chosen": -6.767345428466797, "logps/rejected": -36.096797943115234, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 3.419342041015625, "rewards/margins": 3.419342041015625, "rewards/rejected": 0.0, "step": 2180 }, { "epoch": 12.17877094972067, "eval_logits/chosen": -3.2602896690368652, "eval_logits/rejected": -3.3862922191619873, "eval_logps/chosen": -30.069255828857422, "eval_logps/rejected": -55.100860595703125, "eval_loss": 1.0059175491333008, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": 7.850527617847547e-05, "eval_rewards/margins": 7.850527617847547e-05, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 2180 }, { "epoch": 12.184357541899441, "grad_norm": 1.333253235266498, "learning_rate": 1.6353657512212165e-07, "logits/chosen": -3.5264642238616943, "logits/rejected": -3.4460244178771973, "logps/chosen": -4.27070426940918, "logps/rejected": -65.77252960205078, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 2.6007394790649414, "rewards/margins": 2.6007394790649414, "rewards/rejected": 0.0, "step": 2181 }, { "epoch": 12.189944134078212, "grad_norm": 0.45016741182450515, "learning_rate": 1.6308593963971602e-07, "logits/chosen": -3.445392370223999, "logits/rejected": -3.356531858444214, "logps/chosen": -10.348440170288086, "logps/rejected": -49.000614166259766, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 3.6431450843811035, "rewards/margins": 3.6431450843811035, "rewards/rejected": 0.0, "step": 2182 }, { "epoch": 12.195530726256983, "grad_norm": 0.3812873777314003, "learning_rate": 1.6263580487160522e-07, "logits/chosen": -3.4518959522247314, "logits/rejected": -3.5236244201660156, "logps/chosen": -2.4868862628936768, "logps/rejected": -35.648406982421875, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 3.286860942840576, "rewards/margins": 3.286860942840576, "rewards/rejected": 0.0, "step": 2183 }, { "epoch": 12.201117318435754, "grad_norm": 0.3654832108414279, "learning_rate": 1.6218617148676967e-07, "logits/chosen": -3.2200927734375, "logits/rejected": -3.3731446266174316, "logps/chosen": -1.4337997436523438, "logps/rejected": -48.387725830078125, "loss": 0.1456, "rewards/accuracies": 1.0, "rewards/chosen": 2.6443228721618652, "rewards/margins": 2.6443228721618652, "rewards/rejected": 0.0, "step": 2184 }, { "epoch": 12.206703910614525, "grad_norm": 0.3652935435315875, "learning_rate": 1.6173704015344462e-07, "logits/chosen": -3.314303398132324, "logits/rejected": -3.428551435470581, "logps/chosen": -0.7392306327819824, "logps/rejected": -63.2274055480957, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 2.33054780960083, "rewards/margins": 2.33054780960083, "rewards/rejected": 0.0, "step": 2185 }, { "epoch": 12.212290502793296, "grad_norm": 0.43048003457963097, "learning_rate": 1.612884115391193e-07, "logits/chosen": -3.207775354385376, "logits/rejected": -3.3207106590270996, "logps/chosen": -0.2809603214263916, "logps/rejected": -66.76077270507812, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 1.8991446495056152, "rewards/margins": 1.8991446495056152, "rewards/rejected": 0.0, "step": 2186 }, { "epoch": 12.217877094972067, "grad_norm": 0.406424038962057, "learning_rate": 1.6084028631053547e-07, "logits/chosen": -3.457875967025757, "logits/rejected": -3.4442527294158936, "logps/chosen": -14.073575019836426, "logps/rejected": -75.53553771972656, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 3.7035882472991943, "rewards/margins": 3.7035882472991943, "rewards/rejected": 0.0, "step": 2187 }, { "epoch": 12.223463687150838, "grad_norm": 0.3896109584985441, "learning_rate": 1.6039266513368756e-07, "logits/chosen": -3.3706226348876953, "logits/rejected": -3.5377938747406006, "logps/chosen": -0.7984238862991333, "logps/rejected": -86.73091125488281, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 2.480565071105957, "rewards/margins": 2.480565071105957, "rewards/rejected": 0.0, "step": 2188 }, { "epoch": 12.22905027932961, "grad_norm": 0.591910687950212, "learning_rate": 1.5994554867381966e-07, "logits/chosen": -3.386492967605591, "logits/rejected": -3.1366889476776123, "logps/chosen": -4.666633129119873, "logps/rejected": -84.95759582519531, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.382068157196045, "rewards/margins": 2.382068157196045, "rewards/rejected": 0.0, "step": 2189 }, { "epoch": 12.23463687150838, "grad_norm": 0.3807122108778428, "learning_rate": 1.5949893759542697e-07, "logits/chosen": -3.2206106185913086, "logits/rejected": -3.2755048274993896, "logps/chosen": -0.6934922933578491, "logps/rejected": -24.947383880615234, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.2132568359375, "rewards/margins": 2.2132568359375, "rewards/rejected": 0.0, "step": 2190 }, { "epoch": 12.240223463687151, "grad_norm": 0.4109311447771417, "learning_rate": 1.59052832562253e-07, "logits/chosen": -3.467851400375366, "logits/rejected": -3.601161003112793, "logps/chosen": -1.1642745733261108, "logps/rejected": -31.09759521484375, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 2.292839527130127, "rewards/margins": 2.292839527130127, "rewards/rejected": 0.0, "step": 2191 }, { "epoch": 12.245810055865922, "grad_norm": 0.6455000209211309, "learning_rate": 1.5860723423728912e-07, "logits/chosen": -3.2219746112823486, "logits/rejected": -3.1025068759918213, "logps/chosen": -20.236587524414062, "logps/rejected": -45.65630340576172, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.6771585941314697, "rewards/margins": 2.6771585941314697, "rewards/rejected": 0.0, "step": 2192 }, { "epoch": 12.251396648044693, "grad_norm": 0.4645737219040393, "learning_rate": 1.5816214328277393e-07, "logits/chosen": -3.4909138679504395, "logits/rejected": -3.520278215408325, "logps/chosen": -0.46695539355278015, "logps/rejected": -47.601051330566406, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 1.9735751152038574, "rewards/margins": 1.9735751152038574, "rewards/rejected": 0.0, "step": 2193 }, { "epoch": 12.256983240223464, "grad_norm": 0.4397582110207123, "learning_rate": 1.5771756036019184e-07, "logits/chosen": -3.3240466117858887, "logits/rejected": -3.5705275535583496, "logps/chosen": -3.9083380699157715, "logps/rejected": -42.57920455932617, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 3.4163131713867188, "rewards/margins": 3.4163131713867188, "rewards/rejected": 0.0, "step": 2194 }, { "epoch": 12.262569832402235, "grad_norm": 0.5262597377670216, "learning_rate": 1.5727348613027218e-07, "logits/chosen": -3.5049076080322266, "logits/rejected": -3.6485280990600586, "logps/chosen": -5.9262590408325195, "logps/rejected": -37.675071716308594, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 3.1949758529663086, "rewards/margins": 3.1949758529663086, "rewards/rejected": 0.0, "step": 2195 }, { "epoch": 12.268156424581006, "grad_norm": 0.416532771090321, "learning_rate": 1.5682992125298845e-07, "logits/chosen": -3.2953948974609375, "logits/rejected": -3.368100881576538, "logps/chosen": -0.5940881371498108, "logps/rejected": -68.54150390625, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.233419418334961, "rewards/margins": 2.233419418334961, "rewards/rejected": 0.0, "step": 2196 }, { "epoch": 12.273743016759777, "grad_norm": 0.5294551958350034, "learning_rate": 1.5638686638755693e-07, "logits/chosen": -3.361480474472046, "logits/rejected": -3.4927079677581787, "logps/chosen": -0.32955583930015564, "logps/rejected": -58.109832763671875, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 1.5935286283493042, "rewards/margins": 1.5935286283493042, "rewards/rejected": 0.0, "step": 2197 }, { "epoch": 12.279329608938548, "grad_norm": 0.44180384245356047, "learning_rate": 1.5594432219243597e-07, "logits/chosen": -3.352954149246216, "logits/rejected": -3.4404146671295166, "logps/chosen": -5.233445167541504, "logps/rejected": -59.0228157043457, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.1550750732421875, "rewards/margins": 3.1550750732421875, "rewards/rejected": 0.0, "step": 2198 }, { "epoch": 12.28491620111732, "grad_norm": 0.4837843686530828, "learning_rate": 1.555022893253255e-07, "logits/chosen": -3.3813109397888184, "logits/rejected": -3.3913769721984863, "logps/chosen": -2.2488725185394287, "logps/rejected": -39.50756072998047, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 3.2228925228118896, "rewards/margins": 3.2228925228118896, "rewards/rejected": 0.0, "step": 2199 }, { "epoch": 12.29050279329609, "grad_norm": 0.38008790278698096, "learning_rate": 1.5506076844316446e-07, "logits/chosen": -3.2536447048187256, "logits/rejected": -3.1121184825897217, "logps/chosen": -3.3936192989349365, "logps/rejected": -37.43659973144531, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 3.1023712158203125, "rewards/margins": 3.1023712158203125, "rewards/rejected": 0.0, "step": 2200 }, { "epoch": 12.29050279329609, "eval_logits/chosen": -3.273606777191162, "eval_logits/rejected": -3.3964200019836426, "eval_logps/chosen": -30.072830200195312, "eval_logps/rejected": -54.90795135498047, "eval_loss": 1.008197546005249, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.000278729188721627, "eval_rewards/margins": -0.000278729188721627, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7007, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 2200 }, { "epoch": 12.296089385474861, "grad_norm": 0.39689128481062536, "learning_rate": 1.5461976020213202e-07, "logits/chosen": -3.4687182903289795, "logits/rejected": -3.4634604454040527, "logps/chosen": -25.262069702148438, "logps/rejected": -52.00890350341797, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 3.5829715728759766, "rewards/margins": 3.5829715728759766, "rewards/rejected": 0.0, "step": 2201 }, { "epoch": 12.30167597765363, "grad_norm": 0.553139249662218, "learning_rate": 1.5417926525764474e-07, "logits/chosen": -3.5827085971832275, "logits/rejected": -3.570051670074463, "logps/chosen": -1.1240618228912354, "logps/rejected": -77.54736328125, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.891718864440918, "rewards/margins": 2.891718864440918, "rewards/rejected": 0.0, "step": 2202 }, { "epoch": 12.307262569832401, "grad_norm": 0.43779318577112997, "learning_rate": 1.5373928426435662e-07, "logits/chosen": -3.643719434738159, "logits/rejected": -3.5031206607818604, "logps/chosen": -4.085589408874512, "logps/rejected": -45.71474838256836, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 2.81341552734375, "rewards/margins": 2.81341552734375, "rewards/rejected": 0.0, "step": 2203 }, { "epoch": 12.312849162011172, "grad_norm": 1.1185142752538035, "learning_rate": 1.532998178761577e-07, "logits/chosen": -3.420722007751465, "logits/rejected": -3.2349698543548584, "logps/chosen": -2.943021774291992, "logps/rejected": -63.329444885253906, "loss": 0.1255, "rewards/accuracies": 1.0, "rewards/chosen": 2.979187488555908, "rewards/margins": 2.979187488555908, "rewards/rejected": 0.0, "step": 2204 }, { "epoch": 12.318435754189943, "grad_norm": 0.6442421529523719, "learning_rate": 1.5286086674617337e-07, "logits/chosen": -3.457326650619507, "logits/rejected": -3.456826686859131, "logps/chosen": -14.299467086791992, "logps/rejected": -37.61988830566406, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 4.071661472320557, "rewards/margins": 4.071661472320557, "rewards/rejected": 0.0, "step": 2205 }, { "epoch": 12.324022346368714, "grad_norm": 1.8035445348879455, "learning_rate": 1.5242243152676316e-07, "logits/chosen": -3.164618492126465, "logits/rejected": -3.2809653282165527, "logps/chosen": -0.4628209173679352, "logps/rejected": -85.3101577758789, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 2.0777363777160645, "rewards/margins": 2.0777363777160645, "rewards/rejected": 0.0, "step": 2206 }, { "epoch": 12.329608938547485, "grad_norm": 1.2268782360442905, "learning_rate": 1.5198451286951974e-07, "logits/chosen": -3.4171974658966064, "logits/rejected": -3.4579007625579834, "logps/chosen": -2.159998893737793, "logps/rejected": -32.2164306640625, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": 2.5198140144348145, "rewards/margins": 2.5198140144348145, "rewards/rejected": 0.0, "step": 2207 }, { "epoch": 12.335195530726256, "grad_norm": 0.3357318551819423, "learning_rate": 1.5154711142526865e-07, "logits/chosen": -3.39219331741333, "logits/rejected": -3.5031139850616455, "logps/chosen": -0.35548001527786255, "logps/rejected": -82.95361328125, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 2.0930593013763428, "rewards/margins": 2.0930593013763428, "rewards/rejected": 0.0, "step": 2208 }, { "epoch": 12.340782122905027, "grad_norm": 0.4103618008361082, "learning_rate": 1.5111022784406575e-07, "logits/chosen": -3.3791322708129883, "logits/rejected": -3.258723735809326, "logps/chosen": -0.6859988570213318, "logps/rejected": -33.16255187988281, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": 2.4664194583892822, "rewards/margins": 2.4664194583892822, "rewards/rejected": 0.0, "step": 2209 }, { "epoch": 12.346368715083798, "grad_norm": 0.49985270363340084, "learning_rate": 1.5067386277519827e-07, "logits/chosen": -3.5391788482666016, "logits/rejected": -3.704902410507202, "logps/chosen": -9.990312576293945, "logps/rejected": -43.68678283691406, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 3.8827545642852783, "rewards/margins": 3.8827545642852783, "rewards/rejected": 0.0, "step": 2210 }, { "epoch": 12.35195530726257, "grad_norm": 0.368229749968429, "learning_rate": 1.502380168671823e-07, "logits/chosen": -3.7137563228607178, "logits/rejected": -3.5952017307281494, "logps/chosen": -3.101325750350952, "logps/rejected": -38.811553955078125, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 3.0027360916137695, "rewards/margins": 3.0027360916137695, "rewards/rejected": 0.0, "step": 2211 }, { "epoch": 12.35754189944134, "grad_norm": 1.620329892973557, "learning_rate": 1.4980269076776246e-07, "logits/chosen": -3.2845687866210938, "logits/rejected": -3.166053295135498, "logps/chosen": -0.5835750102996826, "logps/rejected": -54.8746452331543, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 2.3022303581237793, "rewards/margins": 2.3022303581237793, "rewards/rejected": 0.0, "step": 2212 }, { "epoch": 12.363128491620111, "grad_norm": 1.4818341578006364, "learning_rate": 1.4936788512391086e-07, "logits/chosen": -3.530430316925049, "logits/rejected": -3.3934195041656494, "logps/chosen": -0.3126392960548401, "logps/rejected": -39.160057067871094, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 1.7503440380096436, "rewards/margins": 1.7503440380096436, "rewards/rejected": 0.0, "step": 2213 }, { "epoch": 12.368715083798882, "grad_norm": 0.4162038412431759, "learning_rate": 1.489336005818262e-07, "logits/chosen": -3.309678316116333, "logits/rejected": -3.298839569091797, "logps/chosen": -3.787191867828369, "logps/rejected": -68.62223815917969, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 2.5112075805664062, "rewards/margins": 2.5112075805664062, "rewards/rejected": 0.0, "step": 2214 }, { "epoch": 12.374301675977653, "grad_norm": 0.3863444820312, "learning_rate": 1.4849983778693247e-07, "logits/chosen": -3.56412410736084, "logits/rejected": -3.5686590671539307, "logps/chosen": -1.782627820968628, "logps/rejected": -30.9729061126709, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 2.731541633605957, "rewards/margins": 2.731541633605957, "rewards/rejected": 0.0, "step": 2215 }, { "epoch": 12.379888268156424, "grad_norm": 0.4670909926411719, "learning_rate": 1.4806659738387845e-07, "logits/chosen": -3.4526968002319336, "logits/rejected": -3.2827701568603516, "logps/chosen": -8.086905479431152, "logps/rejected": -61.43003463745117, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 3.2623023986816406, "rewards/margins": 3.2623023986816406, "rewards/rejected": 0.0, "step": 2216 }, { "epoch": 12.385474860335195, "grad_norm": 0.5216245990843851, "learning_rate": 1.47633880016537e-07, "logits/chosen": -3.4484896659851074, "logits/rejected": -3.606757640838623, "logps/chosen": -1.9328851699829102, "logps/rejected": -61.44506072998047, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.7314529418945312, "rewards/margins": 2.7314529418945312, "rewards/rejected": 0.0, "step": 2217 }, { "epoch": 12.391061452513966, "grad_norm": 0.39406110944684036, "learning_rate": 1.472016863280024e-07, "logits/chosen": -3.373232841491699, "logits/rejected": -3.3617072105407715, "logps/chosen": -1.397723913192749, "logps/rejected": -33.85374069213867, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 2.688080310821533, "rewards/margins": 2.688080310821533, "rewards/rejected": 0.0, "step": 2218 }, { "epoch": 12.396648044692737, "grad_norm": 0.5872700290830913, "learning_rate": 1.4677001696059205e-07, "logits/chosen": -3.268261671066284, "logits/rejected": -3.110149621963501, "logps/chosen": -49.851505279541016, "logps/rejected": -38.20856475830078, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 4.620477676391602, "rewards/margins": 4.620477676391602, "rewards/rejected": 0.0, "step": 2219 }, { "epoch": 12.402234636871508, "grad_norm": 0.7493306505323228, "learning_rate": 1.463388725558433e-07, "logits/chosen": -3.0824813842773438, "logits/rejected": -3.2189035415649414, "logps/chosen": -1.364945650100708, "logps/rejected": -90.79068756103516, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 2.3378281593322754, "rewards/margins": 2.3378281593322754, "rewards/rejected": 0.0, "step": 2220 }, { "epoch": 12.402234636871508, "eval_logits/chosen": -3.263772964477539, "eval_logits/rejected": -3.389078140258789, "eval_logps/chosen": -29.9830379486084, "eval_logps/rejected": -54.61565399169922, "eval_loss": 1.0082952976226807, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": 0.00870051421225071, "eval_rewards/margins": 0.00870051421225071, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7282, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 2220 }, { "epoch": 12.40782122905028, "grad_norm": 0.7994012378277839, "learning_rate": 1.459082537545136e-07, "logits/chosen": -3.4357802867889404, "logits/rejected": -3.5319669246673584, "logps/chosen": -10.154078483581543, "logps/rejected": -25.528575897216797, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 3.206284523010254, "rewards/margins": 3.206284523010254, "rewards/rejected": 0.0, "step": 2221 }, { "epoch": 12.41340782122905, "grad_norm": 0.4414507972582413, "learning_rate": 1.4547816119657906e-07, "logits/chosen": -3.3708724975585938, "logits/rejected": -3.5275392532348633, "logps/chosen": -2.411421298980713, "logps/rejected": -50.24624252319336, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 2.822587013244629, "rewards/margins": 2.822587013244629, "rewards/rejected": 0.0, "step": 2222 }, { "epoch": 12.418994413407821, "grad_norm": 2.28493377914312, "learning_rate": 1.4504859552123388e-07, "logits/chosen": -3.313854217529297, "logits/rejected": -3.4889936447143555, "logps/chosen": -3.167032480239868, "logps/rejected": -65.09809112548828, "loss": 0.1161, "rewards/accuracies": 1.0, "rewards/chosen": 2.334043502807617, "rewards/margins": 2.334043502807617, "rewards/rejected": 0.0, "step": 2223 }, { "epoch": 12.424581005586592, "grad_norm": 0.4608324185014521, "learning_rate": 1.4461955736688913e-07, "logits/chosen": -3.000213146209717, "logits/rejected": -2.950758218765259, "logps/chosen": -9.52685832977295, "logps/rejected": -50.034454345703125, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 2.961146354675293, "rewards/margins": 2.961146354675293, "rewards/rejected": 0.0, "step": 2224 }, { "epoch": 12.430167597765363, "grad_norm": 0.7029217378704538, "learning_rate": 1.4419104737117194e-07, "logits/chosen": -3.2592275142669678, "logits/rejected": -3.2010481357574463, "logps/chosen": -8.226953506469727, "logps/rejected": -46.574825286865234, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 3.6288509368896484, "rewards/margins": 3.6288509368896484, "rewards/rejected": 0.0, "step": 2225 }, { "epoch": 12.435754189944134, "grad_norm": 1.8061372757179883, "learning_rate": 1.4376306617092444e-07, "logits/chosen": -3.177401542663574, "logits/rejected": -3.282233238220215, "logps/chosen": -1.1220273971557617, "logps/rejected": -81.22883605957031, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 2.7618532180786133, "rewards/margins": 2.7618532180786133, "rewards/rejected": 0.0, "step": 2226 }, { "epoch": 12.441340782122905, "grad_norm": 0.4328558614661112, "learning_rate": 1.4333561440220283e-07, "logits/chosen": -3.417804002761841, "logits/rejected": -3.6785597801208496, "logps/chosen": -3.133924961090088, "logps/rejected": -59.23695755004883, "loss": 0.0753, "rewards/accuracies": 1.0, "rewards/chosen": 2.837836742401123, "rewards/margins": 2.837836742401123, "rewards/rejected": 0.0, "step": 2227 }, { "epoch": 12.446927374301676, "grad_norm": 0.3953785524468112, "learning_rate": 1.4290869270027694e-07, "logits/chosen": -2.9185352325439453, "logits/rejected": -2.9892590045928955, "logps/chosen": -30.449909210205078, "logps/rejected": -82.8421630859375, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 3.1973934173583984, "rewards/margins": 3.1973934173583984, "rewards/rejected": 0.0, "step": 2228 }, { "epoch": 12.452513966480447, "grad_norm": 0.7888034554133668, "learning_rate": 1.4248230169962788e-07, "logits/chosen": -3.5354950428009033, "logits/rejected": -3.5087695121765137, "logps/chosen": -1.1169776916503906, "logps/rejected": -40.777870178222656, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 2.6863861083984375, "rewards/margins": 2.6863861083984375, "rewards/rejected": 0.0, "step": 2229 }, { "epoch": 12.458100558659218, "grad_norm": 0.5604318581308836, "learning_rate": 1.4205644203394905e-07, "logits/chosen": -3.388920783996582, "logits/rejected": -3.4258475303649902, "logps/chosen": -1.7158160209655762, "logps/rejected": -24.538524627685547, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 2.9679946899414062, "rewards/margins": 2.9679946899414062, "rewards/rejected": 0.0, "step": 2230 }, { "epoch": 12.46368715083799, "grad_norm": 0.45429736244359015, "learning_rate": 1.4163111433614367e-07, "logits/chosen": -3.5283403396606445, "logits/rejected": -3.4119808673858643, "logps/chosen": -6.153657913208008, "logps/rejected": -61.084014892578125, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.651522159576416, "rewards/margins": 2.651522159576416, "rewards/rejected": 0.0, "step": 2231 }, { "epoch": 12.46927374301676, "grad_norm": 0.7081597777192936, "learning_rate": 1.4120631923832433e-07, "logits/chosen": -3.5893843173980713, "logits/rejected": -3.5564889907836914, "logps/chosen": -1.6719268560409546, "logps/rejected": -39.64484405517578, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.543139696121216, "rewards/margins": 2.543139696121216, "rewards/rejected": 0.0, "step": 2232 }, { "epoch": 12.474860335195531, "grad_norm": 1.7343545818297634, "learning_rate": 1.407820573718123e-07, "logits/chosen": -3.1008386611938477, "logits/rejected": -3.3952138423919678, "logps/chosen": -0.5091478824615479, "logps/rejected": -76.26481628417969, "loss": 0.1341, "rewards/accuracies": 1.0, "rewards/chosen": 1.9277145862579346, "rewards/margins": 1.9277145862579346, "rewards/rejected": 0.0, "step": 2233 }, { "epoch": 12.480446927374302, "grad_norm": 0.9048386788322109, "learning_rate": 1.403583293671362e-07, "logits/chosen": -3.2435061931610107, "logits/rejected": -3.1556649208068848, "logps/chosen": -4.584765434265137, "logps/rejected": -57.08665466308594, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 2.6094436645507812, "rewards/margins": 2.6094436645507812, "rewards/rejected": 0.0, "step": 2234 }, { "epoch": 12.486033519553073, "grad_norm": 0.6167273944700344, "learning_rate": 1.3993513585403137e-07, "logits/chosen": -3.394244432449341, "logits/rejected": -3.3959386348724365, "logps/chosen": -0.9997613430023193, "logps/rejected": -29.299489974975586, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 2.547755241394043, "rewards/margins": 2.547755241394043, "rewards/rejected": 0.0, "step": 2235 }, { "epoch": 12.491620111731844, "grad_norm": 1.2274380369073263, "learning_rate": 1.3951247746143858e-07, "logits/chosen": -3.270373821258545, "logits/rejected": -3.2850375175476074, "logps/chosen": -4.181968688964844, "logps/rejected": -44.3603515625, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 2.7681868076324463, "rewards/margins": 2.7681868076324463, "rewards/rejected": 0.0, "step": 2236 }, { "epoch": 12.497206703910614, "grad_norm": 0.4390033977859069, "learning_rate": 1.3909035481750387e-07, "logits/chosen": -3.350935459136963, "logits/rejected": -3.4299426078796387, "logps/chosen": -0.43021515011787415, "logps/rejected": -37.922393798828125, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 2.0837368965148926, "rewards/margins": 2.0837368965148926, "rewards/rejected": 0.0, "step": 2237 }, { "epoch": 12.502793296089386, "grad_norm": 0.6648856335827535, "learning_rate": 1.386687685495761e-07, "logits/chosen": -3.3990509510040283, "logits/rejected": -3.4585108757019043, "logps/chosen": -0.5690751671791077, "logps/rejected": -63.70109558105469, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": 1.9457981586456299, "rewards/margins": 1.9457981586456299, "rewards/rejected": 0.0, "step": 2238 }, { "epoch": 12.508379888268156, "grad_norm": 0.4289420248302391, "learning_rate": 1.3824771928420798e-07, "logits/chosen": -3.1850552558898926, "logits/rejected": -3.1720054149627686, "logps/chosen": -1.643048644065857, "logps/rejected": -43.59968185424805, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.7690720558166504, "rewards/margins": 2.7690720558166504, "rewards/rejected": 0.0, "step": 2239 }, { "epoch": 12.513966480446927, "grad_norm": 0.3732674150498357, "learning_rate": 1.3782720764715354e-07, "logits/chosen": -3.4731249809265137, "logits/rejected": -3.4296131134033203, "logps/chosen": -0.17604614794254303, "logps/rejected": -56.595123291015625, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 1.5817192792892456, "rewards/margins": 1.5817192792892456, "rewards/rejected": 0.0, "step": 2240 }, { "epoch": 12.513966480446927, "eval_logits/chosen": -3.2657618522644043, "eval_logits/rejected": -3.390824556350708, "eval_logps/chosen": -30.071054458618164, "eval_logps/rejected": -55.22324752807617, "eval_loss": 1.0132746696472168, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.00010088682029163465, "eval_rewards/margins": -0.00010088682029163465, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6892, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 2240 }, { "epoch": 12.519553072625698, "grad_norm": 0.7790209618270445, "learning_rate": 1.37407234263368e-07, "logits/chosen": -3.5272529125213623, "logits/rejected": -3.371617555618286, "logps/chosen": -2.3697447776794434, "logps/rejected": -40.91530227661133, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 2.849588394165039, "rewards/margins": 2.849588394165039, "rewards/rejected": 0.0, "step": 2241 }, { "epoch": 12.525139664804469, "grad_norm": 1.0634639607925207, "learning_rate": 1.3698779975700657e-07, "logits/chosen": -3.3085103034973145, "logits/rejected": -3.4670214653015137, "logps/chosen": -2.182471513748169, "logps/rejected": -73.75332641601562, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 2.5379199981689453, "rewards/margins": 2.5379199981689453, "rewards/rejected": 0.0, "step": 2242 }, { "epoch": 12.53072625698324, "grad_norm": 2.0743889979879833, "learning_rate": 1.3656890475142363e-07, "logits/chosen": -3.423936367034912, "logits/rejected": -3.3345353603363037, "logps/chosen": -1.023058295249939, "logps/rejected": -61.91513442993164, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.6286544799804688, "rewards/margins": 2.6286544799804688, "rewards/rejected": 0.0, "step": 2243 }, { "epoch": 12.53631284916201, "grad_norm": 0.39536921993716084, "learning_rate": 1.3615054986917179e-07, "logits/chosen": -3.265951156616211, "logits/rejected": -3.256979465484619, "logps/chosen": -11.967455863952637, "logps/rejected": -78.19622802734375, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 3.3327698707580566, "rewards/margins": 3.3327698707580566, "rewards/rejected": 0.0, "step": 2244 }, { "epoch": 12.541899441340782, "grad_norm": 1.8837471000302437, "learning_rate": 1.3573273573200072e-07, "logits/chosen": -3.3350565433502197, "logits/rejected": -3.3303418159484863, "logps/chosen": -1.2605342864990234, "logps/rejected": -43.27762222290039, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 2.628943681716919, "rewards/margins": 2.628943681716919, "rewards/rejected": 0.0, "step": 2245 }, { "epoch": 12.547486033519553, "grad_norm": 0.4651543847476161, "learning_rate": 1.3531546296085715e-07, "logits/chosen": -3.2130534648895264, "logits/rejected": -3.4951980113983154, "logps/chosen": -1.6723864078521729, "logps/rejected": -40.059288024902344, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.673156261444092, "rewards/margins": 2.673156261444092, "rewards/rejected": 0.0, "step": 2246 }, { "epoch": 12.553072625698324, "grad_norm": 0.6138720038777592, "learning_rate": 1.3489873217588205e-07, "logits/chosen": -3.507383108139038, "logits/rejected": -3.515362024307251, "logps/chosen": -1.6765773296356201, "logps/rejected": -67.03004455566406, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": 2.8080921173095703, "rewards/margins": 2.8080921173095703, "rewards/rejected": 0.0, "step": 2247 }, { "epoch": 12.558659217877095, "grad_norm": 0.7722542782114533, "learning_rate": 1.3448254399641206e-07, "logits/chosen": -3.5299947261810303, "logits/rejected": -3.495619058609009, "logps/chosen": -14.777970314025879, "logps/rejected": -62.427162170410156, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 3.873805522918701, "rewards/margins": 3.873805522918701, "rewards/rejected": 0.0, "step": 2248 }, { "epoch": 12.564245810055866, "grad_norm": 0.41163074403727734, "learning_rate": 1.340668990409768e-07, "logits/chosen": -3.5440049171447754, "logits/rejected": -3.554119825363159, "logps/chosen": -3.7114877700805664, "logps/rejected": -36.919189453125, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": 3.0860977172851562, "rewards/margins": 3.0860977172851562, "rewards/rejected": 0.0, "step": 2249 }, { "epoch": 12.569832402234637, "grad_norm": 0.4857325688319073, "learning_rate": 1.336517979272987e-07, "logits/chosen": -3.6649792194366455, "logits/rejected": -3.497124433517456, "logps/chosen": -1.410256266593933, "logps/rejected": -38.30725860595703, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 2.8645949363708496, "rewards/margins": 2.8645949363708496, "rewards/rejected": 0.0, "step": 2250 }, { "epoch": 12.575418994413408, "grad_norm": 0.42074558799151407, "learning_rate": 1.3323724127229191e-07, "logits/chosen": -3.2385005950927734, "logits/rejected": -3.319828987121582, "logps/chosen": -0.31787440180778503, "logps/rejected": -84.67262268066406, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 1.686431884765625, "rewards/margins": 1.686431884765625, "rewards/rejected": 0.0, "step": 2251 }, { "epoch": 12.581005586592179, "grad_norm": 0.518755336213781, "learning_rate": 1.3282322969206145e-07, "logits/chosen": -2.9969799518585205, "logits/rejected": -2.7330167293548584, "logps/chosen": -18.212379455566406, "logps/rejected": -60.43743133544922, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 3.314410448074341, "rewards/margins": 3.314410448074341, "rewards/rejected": 0.0, "step": 2252 }, { "epoch": 12.58659217877095, "grad_norm": 0.38469720487246467, "learning_rate": 1.3240976380190228e-07, "logits/chosen": -3.724048614501953, "logits/rejected": -3.806608200073242, "logps/chosen": -2.855919361114502, "logps/rejected": -27.25371551513672, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.6647424697875977, "rewards/margins": 2.6647424697875977, "rewards/rejected": 0.0, "step": 2253 }, { "epoch": 12.59217877094972, "grad_norm": 0.9301604504936327, "learning_rate": 1.319968442162983e-07, "logits/chosen": -3.6221389770507812, "logits/rejected": -3.578037977218628, "logps/chosen": -7.850653171539307, "logps/rejected": -50.1861572265625, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.3544840812683105, "rewards/margins": 2.3544840812683105, "rewards/rejected": 0.0, "step": 2254 }, { "epoch": 12.597765363128492, "grad_norm": 0.41255745904527696, "learning_rate": 1.3158447154892167e-07, "logits/chosen": -3.176616907119751, "logits/rejected": -3.4189412593841553, "logps/chosen": -6.886829853057861, "logps/rejected": -48.617454528808594, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 3.4059970378875732, "rewards/margins": 3.4059970378875732, "rewards/rejected": 0.0, "step": 2255 }, { "epoch": 12.603351955307263, "grad_norm": 0.8510964543267436, "learning_rate": 1.3117264641263138e-07, "logits/chosen": -3.5320487022399902, "logits/rejected": -3.3638176918029785, "logps/chosen": -15.711894035339355, "logps/rejected": -36.03840255737305, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 3.414116382598877, "rewards/margins": 3.414116382598877, "rewards/rejected": 0.0, "step": 2256 }, { "epoch": 12.608938547486034, "grad_norm": 0.4011281051392134, "learning_rate": 1.307613694194733e-07, "logits/chosen": -3.2588086128234863, "logits/rejected": -3.465930461883545, "logps/chosen": -2.835179328918457, "logps/rejected": -98.97663879394531, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 3.0574088096618652, "rewards/margins": 3.0574088096618652, "rewards/rejected": 0.0, "step": 2257 }, { "epoch": 12.614525139664805, "grad_norm": 0.380910060419441, "learning_rate": 1.3035064118067784e-07, "logits/chosen": -3.342238426208496, "logits/rejected": -3.1885581016540527, "logps/chosen": -11.553668975830078, "logps/rejected": -27.367694854736328, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 4.3055620193481445, "rewards/margins": 4.3055620193481445, "rewards/rejected": 0.0, "step": 2258 }, { "epoch": 12.620111731843576, "grad_norm": 2.3723394233967396, "learning_rate": 1.2994046230666067e-07, "logits/chosen": -3.3266143798828125, "logits/rejected": -3.239595651626587, "logps/chosen": -1.1125938892364502, "logps/rejected": -56.21533966064453, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 2.46859073638916, "rewards/margins": 2.46859073638916, "rewards/rejected": 0.0, "step": 2259 }, { "epoch": 12.625698324022347, "grad_norm": 0.40838752606489764, "learning_rate": 1.2953083340702048e-07, "logits/chosen": -3.198375940322876, "logits/rejected": -3.3978817462921143, "logps/chosen": -0.4842168688774109, "logps/rejected": -103.18638610839844, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 2.0734763145446777, "rewards/margins": 2.0734763145446777, "rewards/rejected": 0.0, "step": 2260 }, { "epoch": 12.625698324022347, "eval_logits/chosen": -3.2565982341766357, "eval_logits/rejected": -3.3827126026153564, "eval_logps/chosen": -30.2506103515625, "eval_logps/rejected": -55.432945251464844, "eval_loss": 1.0206115245819092, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.01805674470961094, "eval_rewards/margins": -0.01805674470961094, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7009, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 2260 }, { "epoch": 12.631284916201118, "grad_norm": 1.0435656728977338, "learning_rate": 1.291217550905388e-07, "logits/chosen": -3.0976061820983887, "logits/rejected": -3.229267120361328, "logps/chosen": -17.147357940673828, "logps/rejected": -43.47943878173828, "loss": 0.0882, "rewards/accuracies": 1.0, "rewards/chosen": 3.126455307006836, "rewards/margins": 3.126455307006836, "rewards/rejected": 0.0, "step": 2261 }, { "epoch": 12.636871508379889, "grad_norm": 0.6080571860345122, "learning_rate": 1.2871322796517885e-07, "logits/chosen": -3.450392007827759, "logits/rejected": -3.6465983390808105, "logps/chosen": -0.8923044204711914, "logps/rejected": -60.1802864074707, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 2.3599853515625, "rewards/margins": 2.3599853515625, "rewards/rejected": 0.0, "step": 2262 }, { "epoch": 12.64245810055866, "grad_norm": 0.9806283048446751, "learning_rate": 1.283052526380848e-07, "logits/chosen": -3.5287232398986816, "logits/rejected": -3.418076992034912, "logps/chosen": -0.7432454824447632, "logps/rejected": -89.81851196289062, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 2.3667078018188477, "rewards/margins": 2.3667078018188477, "rewards/rejected": 0.0, "step": 2263 }, { "epoch": 12.64804469273743, "grad_norm": 0.4467889703973007, "learning_rate": 1.2789782971558044e-07, "logits/chosen": -3.540520191192627, "logits/rejected": -3.454953908920288, "logps/chosen": -5.883553981781006, "logps/rejected": -87.08979797363281, "loss": 0.1113, "rewards/accuracies": 1.0, "rewards/chosen": 3.080805778503418, "rewards/margins": 3.080805778503418, "rewards/rejected": 0.0, "step": 2264 }, { "epoch": 12.653631284916202, "grad_norm": 0.4474252456646257, "learning_rate": 1.2749095980316876e-07, "logits/chosen": -3.170991897583008, "logits/rejected": -3.045565605163574, "logps/chosen": -4.559657573699951, "logps/rejected": -65.60655212402344, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 2.680539608001709, "rewards/margins": 2.680539608001709, "rewards/rejected": 0.0, "step": 2265 }, { "epoch": 12.659217877094973, "grad_norm": 0.42478662256223054, "learning_rate": 1.2708464350553134e-07, "logits/chosen": -3.2953999042510986, "logits/rejected": -3.3270702362060547, "logps/chosen": -6.541364669799805, "logps/rejected": -37.65180206298828, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 3.0329442024230957, "rewards/margins": 3.0329442024230957, "rewards/rejected": 0.0, "step": 2266 }, { "epoch": 12.664804469273744, "grad_norm": 0.4749508217388263, "learning_rate": 1.26678881426526e-07, "logits/chosen": -3.16400146484375, "logits/rejected": -3.2337470054626465, "logps/chosen": -0.4564622640609741, "logps/rejected": -67.92149353027344, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 2.0054678916931152, "rewards/margins": 2.0054678916931152, "rewards/rejected": 0.0, "step": 2267 }, { "epoch": 12.670391061452515, "grad_norm": 0.7671061924001767, "learning_rate": 1.2627367416918783e-07, "logits/chosen": -3.6778130531311035, "logits/rejected": -3.4792375564575195, "logps/chosen": -1.3930641412734985, "logps/rejected": -71.66496276855469, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 2.701521873474121, "rewards/margins": 2.701521873474121, "rewards/rejected": 0.0, "step": 2268 }, { "epoch": 12.675977653631286, "grad_norm": 0.40730423026594653, "learning_rate": 1.2586902233572688e-07, "logits/chosen": -3.3053927421569824, "logits/rejected": -3.326603412628174, "logps/chosen": -2.162668228149414, "logps/rejected": -41.80281448364258, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 2.7441184520721436, "rewards/margins": 2.7441184520721436, "rewards/rejected": 0.0, "step": 2269 }, { "epoch": 12.681564245810057, "grad_norm": 0.9292182777541721, "learning_rate": 1.2546492652752782e-07, "logits/chosen": -3.3834147453308105, "logits/rejected": -3.273434638977051, "logps/chosen": -0.6231309771537781, "logps/rejected": -61.2021484375, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 1.9415831565856934, "rewards/margins": 1.9415831565856934, "rewards/rejected": 0.0, "step": 2270 }, { "epoch": 12.687150837988828, "grad_norm": 1.0463400068040039, "learning_rate": 1.2506138734514897e-07, "logits/chosen": -3.2636947631835938, "logits/rejected": -3.337916135787964, "logps/chosen": -8.795644760131836, "logps/rejected": -53.609832763671875, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 3.550365447998047, "rewards/margins": 3.550365447998047, "rewards/rejected": 0.0, "step": 2271 }, { "epoch": 12.692737430167599, "grad_norm": 0.42881096784347483, "learning_rate": 1.2465840538832145e-07, "logits/chosen": -3.402754306793213, "logits/rejected": -3.577317476272583, "logps/chosen": -8.689858436584473, "logps/rejected": -49.614295959472656, "loss": 0.1294, "rewards/accuracies": 1.0, "rewards/chosen": 2.9228854179382324, "rewards/margins": 2.9228854179382324, "rewards/rejected": 0.0, "step": 2272 }, { "epoch": 12.69832402234637, "grad_norm": 1.6028214497713202, "learning_rate": 1.2425598125594822e-07, "logits/chosen": -3.5034029483795166, "logits/rejected": -3.358663320541382, "logps/chosen": -1.7330137491226196, "logps/rejected": -42.20705795288086, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": 2.675685405731201, "rewards/margins": 2.675685405731201, "rewards/rejected": 0.0, "step": 2273 }, { "epoch": 12.703910614525139, "grad_norm": 0.5623449768736747, "learning_rate": 1.2385411554610304e-07, "logits/chosen": -3.3504233360290527, "logits/rejected": -3.390580177307129, "logps/chosen": -5.595243453979492, "logps/rejected": -42.18009948730469, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": 2.6517512798309326, "rewards/margins": 2.6517512798309326, "rewards/rejected": 0.0, "step": 2274 }, { "epoch": 12.70949720670391, "grad_norm": 0.686978584004586, "learning_rate": 1.2345280885603037e-07, "logits/chosen": -3.412993907928467, "logits/rejected": -3.3707876205444336, "logps/chosen": -1.2643492221832275, "logps/rejected": -33.132591247558594, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 2.125216484069824, "rewards/margins": 2.125216484069824, "rewards/rejected": 0.0, "step": 2275 }, { "epoch": 12.71508379888268, "grad_norm": 0.34437154460398195, "learning_rate": 1.2305206178214278e-07, "logits/chosen": -3.456272602081299, "logits/rejected": -3.514094114303589, "logps/chosen": -0.8294367790222168, "logps/rejected": -36.98713684082031, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 2.3648805618286133, "rewards/margins": 2.3648805618286133, "rewards/rejected": 0.0, "step": 2276 }, { "epoch": 12.720670391061452, "grad_norm": 0.4517748131352269, "learning_rate": 1.2265187492002238e-07, "logits/chosen": -3.48850154876709, "logits/rejected": -3.203737497329712, "logps/chosen": -0.5373401045799255, "logps/rejected": -53.24457550048828, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": 2.1982016563415527, "rewards/margins": 2.1982016563415527, "rewards/rejected": 0.0, "step": 2277 }, { "epoch": 12.726256983240223, "grad_norm": 0.9630359520256496, "learning_rate": 1.2225224886441782e-07, "logits/chosen": -3.467395544052124, "logits/rejected": -3.5238499641418457, "logps/chosen": -6.839809894561768, "logps/rejected": -32.9104118347168, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 3.6489503383636475, "rewards/margins": 3.6489503383636475, "rewards/rejected": 0.0, "step": 2278 }, { "epoch": 12.731843575418994, "grad_norm": 0.5032997328960539, "learning_rate": 1.2185318420924462e-07, "logits/chosen": -3.3734967708587646, "logits/rejected": -3.332409381866455, "logps/chosen": -0.4818335771560669, "logps/rejected": -82.39198303222656, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.0698351860046387, "rewards/margins": 2.0698351860046387, "rewards/rejected": 0.0, "step": 2279 }, { "epoch": 12.737430167597765, "grad_norm": 0.40200559581104356, "learning_rate": 1.21454681547584e-07, "logits/chosen": -3.384718656539917, "logits/rejected": -3.402092695236206, "logps/chosen": -4.479341506958008, "logps/rejected": -38.881500244140625, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 2.584264039993286, "rewards/margins": 2.584264039993286, "rewards/rejected": 0.0, "step": 2280 }, { "epoch": 12.737430167597765, "eval_logits/chosen": -3.2632031440734863, "eval_logits/rejected": -3.387631893157959, "eval_logps/chosen": -30.60529136657715, "eval_logps/rejected": -55.6917610168457, "eval_loss": 1.0288151502609253, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.053524959832429886, "eval_rewards/margins": -0.053524959832429886, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7164, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 2280 }, { "epoch": 12.743016759776536, "grad_norm": 0.46464037528578844, "learning_rate": 1.2105674147168177e-07, "logits/chosen": -3.386658191680908, "logits/rejected": -3.4391565322875977, "logps/chosen": -3.7839372158050537, "logps/rejected": -32.402061462402344, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.8743462562561035, "rewards/margins": 2.8743462562561035, "rewards/rejected": 0.0, "step": 2281 }, { "epoch": 12.748603351955307, "grad_norm": 0.5061322845280998, "learning_rate": 1.2065936457294785e-07, "logits/chosen": -3.6489040851593018, "logits/rejected": -3.607834815979004, "logps/chosen": -5.303872585296631, "logps/rejected": -50.91162872314453, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 3.063188076019287, "rewards/margins": 3.063188076019287, "rewards/rejected": 0.0, "step": 2282 }, { "epoch": 12.754189944134078, "grad_norm": 0.4144482019368093, "learning_rate": 1.2026255144195507e-07, "logits/chosen": -3.567878246307373, "logits/rejected": -3.5910308361053467, "logps/chosen": -2.981426239013672, "logps/rejected": -48.233543395996094, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 3.2606635093688965, "rewards/margins": 3.2606635093688965, "rewards/rejected": 0.0, "step": 2283 }, { "epoch": 12.759776536312849, "grad_norm": 1.138898640724903, "learning_rate": 1.1986630266843833e-07, "logits/chosen": -3.4588491916656494, "logits/rejected": -3.533299684524536, "logps/chosen": -2.726400375366211, "logps/rejected": -59.0339469909668, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 2.9490859508514404, "rewards/margins": 2.9490859508514404, "rewards/rejected": 0.0, "step": 2284 }, { "epoch": 12.76536312849162, "grad_norm": 0.5321998402593737, "learning_rate": 1.1947061884129384e-07, "logits/chosen": -3.5031237602233887, "logits/rejected": -3.5955991744995117, "logps/chosen": -0.3842340111732483, "logps/rejected": -78.43391418457031, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 1.9946914911270142, "rewards/margins": 1.9946914911270142, "rewards/rejected": 0.0, "step": 2285 }, { "epoch": 12.77094972067039, "grad_norm": 0.5736551056197052, "learning_rate": 1.1907550054857862e-07, "logits/chosen": -3.4828784465789795, "logits/rejected": -3.4474778175354004, "logps/chosen": -12.686964988708496, "logps/rejected": -37.39101028442383, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 3.75471830368042, "rewards/margins": 3.75471830368042, "rewards/rejected": 0.0, "step": 2286 }, { "epoch": 12.776536312849162, "grad_norm": 0.41298213307299225, "learning_rate": 1.186809483775082e-07, "logits/chosen": -3.2222204208374023, "logits/rejected": -3.294398784637451, "logps/chosen": -5.704745292663574, "logps/rejected": -47.13985061645508, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 3.5716466903686523, "rewards/margins": 3.5716466903686523, "rewards/rejected": 0.0, "step": 2287 }, { "epoch": 12.782122905027933, "grad_norm": 0.4950033879661093, "learning_rate": 1.182869629144579e-07, "logits/chosen": -3.2026522159576416, "logits/rejected": -3.1474239826202393, "logps/chosen": -12.411076545715332, "logps/rejected": -49.24952697753906, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": 2.957362174987793, "rewards/margins": 2.957362174987793, "rewards/rejected": 0.0, "step": 2288 }, { "epoch": 12.787709497206704, "grad_norm": 0.3400573035366642, "learning_rate": 1.1789354474496e-07, "logits/chosen": -3.648020029067993, "logits/rejected": -3.613192319869995, "logps/chosen": -1.627375841140747, "logps/rejected": -44.16062927246094, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 2.3433022499084473, "rewards/margins": 2.3433022499084473, "rewards/rejected": 0.0, "step": 2289 }, { "epoch": 12.793296089385475, "grad_norm": 0.454376022046106, "learning_rate": 1.1750069445370408e-07, "logits/chosen": -3.5968801975250244, "logits/rejected": -3.6151320934295654, "logps/chosen": -7.9256272315979, "logps/rejected": -36.523895263671875, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": 3.9409613609313965, "rewards/margins": 3.9409613609313965, "rewards/rejected": 0.0, "step": 2290 }, { "epoch": 12.798882681564246, "grad_norm": 1.0140073801191927, "learning_rate": 1.1710841262453563e-07, "logits/chosen": -3.7124454975128174, "logits/rejected": -3.6189231872558594, "logps/chosen": -1.2007167339324951, "logps/rejected": -38.000518798828125, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 2.7239294052124023, "rewards/margins": 2.7239294052124023, "rewards/rejected": 0.0, "step": 2291 }, { "epoch": 12.804469273743017, "grad_norm": 0.3992200071958301, "learning_rate": 1.1671669984045529e-07, "logits/chosen": -3.361949920654297, "logits/rejected": -3.4557816982269287, "logps/chosen": -0.41469088196754456, "logps/rejected": -96.77440643310547, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": 1.8790662288665771, "rewards/margins": 1.8790662288665771, "rewards/rejected": 0.0, "step": 2292 }, { "epoch": 12.810055865921788, "grad_norm": 0.45543717625752306, "learning_rate": 1.1632555668361799e-07, "logits/chosen": -3.2611303329467773, "logits/rejected": -3.474437952041626, "logps/chosen": -0.6212449073791504, "logps/rejected": -107.97794342041016, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 2.037877321243286, "rewards/margins": 2.037877321243286, "rewards/rejected": 0.0, "step": 2293 }, { "epoch": 12.815642458100559, "grad_norm": 0.39206772516047084, "learning_rate": 1.1593498373533195e-07, "logits/chosen": -3.4295456409454346, "logits/rejected": -3.521411895751953, "logps/chosen": -4.333605766296387, "logps/rejected": -51.17797088623047, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 3.9523544311523438, "rewards/margins": 3.9523544311523438, "rewards/rejected": 0.0, "step": 2294 }, { "epoch": 12.82122905027933, "grad_norm": 0.37064044990678563, "learning_rate": 1.1554498157605863e-07, "logits/chosen": -3.3964834213256836, "logits/rejected": -3.3317043781280518, "logps/chosen": -1.1701773405075073, "logps/rejected": -41.469482421875, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 2.688302516937256, "rewards/margins": 2.688302516937256, "rewards/rejected": 0.0, "step": 2295 }, { "epoch": 12.8268156424581, "grad_norm": 0.5249714895001094, "learning_rate": 1.1515555078541012e-07, "logits/chosen": -3.524517774581909, "logits/rejected": -3.5933830738067627, "logps/chosen": -6.073652267456055, "logps/rejected": -40.80901336669922, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": 2.266629695892334, "rewards/margins": 2.266629695892334, "rewards/rejected": 0.0, "step": 2296 }, { "epoch": 12.832402234636872, "grad_norm": 0.4645679581551184, "learning_rate": 1.1476669194215033e-07, "logits/chosen": -3.3193717002868652, "logits/rejected": -3.1490135192871094, "logps/chosen": -1.5837781429290771, "logps/rejected": -48.53924560546875, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 2.7303173542022705, "rewards/margins": 2.7303173542022705, "rewards/rejected": 0.0, "step": 2297 }, { "epoch": 12.837988826815643, "grad_norm": 1.3326266721744495, "learning_rate": 1.143784056241926e-07, "logits/chosen": -3.339778184890747, "logits/rejected": -3.4562277793884277, "logps/chosen": -0.9490244388580322, "logps/rejected": -43.138038635253906, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 2.599933385848999, "rewards/margins": 2.599933385848999, "rewards/rejected": 0.0, "step": 2298 }, { "epoch": 12.843575418994414, "grad_norm": 0.41659324142136256, "learning_rate": 1.1399069240859955e-07, "logits/chosen": -3.285973310470581, "logits/rejected": -3.210517406463623, "logps/chosen": -1.9052813053131104, "logps/rejected": -40.415252685546875, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 3.0174355506896973, "rewards/margins": 3.0174355506896973, "rewards/rejected": 0.0, "step": 2299 }, { "epoch": 12.849162011173185, "grad_norm": 0.7115066262589483, "learning_rate": 1.1360355287158212e-07, "logits/chosen": -3.2919743061065674, "logits/rejected": -3.3648951053619385, "logps/chosen": -4.958246231079102, "logps/rejected": -43.85498809814453, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 3.034874439239502, "rewards/margins": 3.034874439239502, "rewards/rejected": 0.0, "step": 2300 }, { "epoch": 12.849162011173185, "eval_logits/chosen": -3.2657992839813232, "eval_logits/rejected": -3.389568328857422, "eval_logps/chosen": -30.225671768188477, "eval_logps/rejected": -55.22808837890625, "eval_loss": 1.0184718370437622, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.01556316576898098, "eval_rewards/margins": -0.01556316576898098, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7301, "eval_samples_per_second": 9.471, "eval_steps_per_second": 0.306, "step": 2300 }, { "epoch": 12.854748603351956, "grad_norm": 0.44602212592943713, "learning_rate": 1.1321698758849851e-07, "logits/chosen": -3.4965732097625732, "logits/rejected": -3.648874282836914, "logps/chosen": -1.9803823232650757, "logps/rejected": -81.80268859863281, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 2.669301986694336, "rewards/margins": 2.669301986694336, "rewards/rejected": 0.0, "step": 2301 }, { "epoch": 12.860335195530727, "grad_norm": 0.460457654972344, "learning_rate": 1.128309971338537e-07, "logits/chosen": -3.4491028785705566, "logits/rejected": -3.6618459224700928, "logps/chosen": -0.38219979405403137, "logps/rejected": -105.92637634277344, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 1.5562540292739868, "rewards/margins": 1.5562540292739868, "rewards/rejected": 0.0, "step": 2302 }, { "epoch": 12.865921787709498, "grad_norm": 0.5539161184618399, "learning_rate": 1.1244558208129795e-07, "logits/chosen": -3.259394407272339, "logits/rejected": -3.2880139350891113, "logps/chosen": -0.3285684287548065, "logps/rejected": -31.84539222717285, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 1.7426209449768066, "rewards/margins": 1.7426209449768066, "rewards/rejected": 0.0, "step": 2303 }, { "epoch": 12.871508379888269, "grad_norm": 0.44330712618631835, "learning_rate": 1.1206074300362723e-07, "logits/chosen": -3.51483416557312, "logits/rejected": -3.612828254699707, "logps/chosen": -0.490126371383667, "logps/rejected": -43.235599517822266, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": 2.2221014499664307, "rewards/margins": 2.2221014499664307, "rewards/rejected": 0.0, "step": 2304 }, { "epoch": 12.87709497206704, "grad_norm": 0.4274635041965022, "learning_rate": 1.1167648047278033e-07, "logits/chosen": -3.399958372116089, "logits/rejected": -3.4519026279449463, "logps/chosen": -0.6736325025558472, "logps/rejected": -34.915618896484375, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 2.456824541091919, "rewards/margins": 2.456824541091919, "rewards/rejected": 0.0, "step": 2305 }, { "epoch": 12.88268156424581, "grad_norm": 1.1598479550425806, "learning_rate": 1.1129279505984035e-07, "logits/chosen": -3.4720335006713867, "logits/rejected": -3.435961961746216, "logps/chosen": -2.019890308380127, "logps/rejected": -50.04304504394531, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 2.0951738357543945, "rewards/margins": 2.0951738357543945, "rewards/rejected": 0.0, "step": 2306 }, { "epoch": 12.888268156424582, "grad_norm": 0.4425863194920016, "learning_rate": 1.1090968733503158e-07, "logits/chosen": -3.3264081478118896, "logits/rejected": -3.538029193878174, "logps/chosen": -0.9513140916824341, "logps/rejected": -86.88038635253906, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 1.8230986595153809, "rewards/margins": 1.8230986595153809, "rewards/rejected": 0.0, "step": 2307 }, { "epoch": 12.893854748603353, "grad_norm": 0.6762900506062927, "learning_rate": 1.1052715786772077e-07, "logits/chosen": -3.3086395263671875, "logits/rejected": -3.4097607135772705, "logps/chosen": -1.1275460720062256, "logps/rejected": -51.35859680175781, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": 2.0423707962036133, "rewards/margins": 2.0423707962036133, "rewards/rejected": 0.0, "step": 2308 }, { "epoch": 12.899441340782122, "grad_norm": 1.1668855063735013, "learning_rate": 1.1014520722641473e-07, "logits/chosen": -3.527906894683838, "logits/rejected": -3.312272548675537, "logps/chosen": -15.089126586914062, "logps/rejected": -22.657346725463867, "loss": 0.0978, "rewards/accuracies": 1.0, "rewards/chosen": 2.603621244430542, "rewards/margins": 2.603621244430542, "rewards/rejected": 0.0, "step": 2309 }, { "epoch": 12.905027932960895, "grad_norm": 0.4243446357729765, "learning_rate": 1.0976383597876004e-07, "logits/chosen": -3.4055118560791016, "logits/rejected": -3.5252139568328857, "logps/chosen": -3.22821044921875, "logps/rejected": -42.4783935546875, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 3.2797393798828125, "rewards/margins": 3.2797393798828125, "rewards/rejected": 0.0, "step": 2310 }, { "epoch": 12.910614525139664, "grad_norm": 0.5526143207246139, "learning_rate": 1.0938304469154236e-07, "logits/chosen": -3.4045262336730957, "logits/rejected": -3.383348226547241, "logps/chosen": -1.4761393070220947, "logps/rejected": -56.56805419921875, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 2.531733512878418, "rewards/margins": 2.531733512878418, "rewards/rejected": 0.0, "step": 2311 }, { "epoch": 12.916201117318435, "grad_norm": 0.5165698942231859, "learning_rate": 1.0900283393068538e-07, "logits/chosen": -3.5687220096588135, "logits/rejected": -3.64994740486145, "logps/chosen": -0.9706995487213135, "logps/rejected": -50.04160690307617, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 2.31058406829834, "rewards/margins": 2.31058406829834, "rewards/rejected": 0.0, "step": 2312 }, { "epoch": 12.921787709497206, "grad_norm": 1.3137916392199143, "learning_rate": 1.0862320426124987e-07, "logits/chosen": -3.4807398319244385, "logits/rejected": -3.5688562393188477, "logps/chosen": -3.280137538909912, "logps/rejected": -51.393150329589844, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 3.0259995460510254, "rewards/margins": 3.0259995460510254, "rewards/rejected": 0.0, "step": 2313 }, { "epoch": 12.927374301675977, "grad_norm": 0.5793005922518515, "learning_rate": 1.0824415624743316e-07, "logits/chosen": -3.4329700469970703, "logits/rejected": -3.5415170192718506, "logps/chosen": -5.662870407104492, "logps/rejected": -44.23683547973633, "loss": 0.0636, "rewards/accuracies": 1.0, "rewards/chosen": 3.847524642944336, "rewards/margins": 3.847524642944336, "rewards/rejected": 0.0, "step": 2314 }, { "epoch": 12.932960893854748, "grad_norm": 0.688741724035675, "learning_rate": 1.0786569045256844e-07, "logits/chosen": -3.2243945598602295, "logits/rejected": -3.1893362998962402, "logps/chosen": -36.53736877441406, "logps/rejected": -35.05921173095703, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 4.05674934387207, "rewards/margins": 4.05674934387207, "rewards/rejected": 0.0, "step": 2315 }, { "epoch": 12.938547486033519, "grad_norm": 0.7024087167910086, "learning_rate": 1.0748780743912273e-07, "logits/chosen": -3.17535662651062, "logits/rejected": -3.2973382472991943, "logps/chosen": -0.6445251703262329, "logps/rejected": -37.588218688964844, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.3380391597747803, "rewards/margins": 2.3380391597747803, "rewards/rejected": 0.0, "step": 2316 }, { "epoch": 12.94413407821229, "grad_norm": 0.43512439717326357, "learning_rate": 1.071105077686979e-07, "logits/chosen": -3.64111328125, "logits/rejected": -3.4275224208831787, "logps/chosen": -0.5026963949203491, "logps/rejected": -82.1552505493164, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": 2.1187191009521484, "rewards/margins": 2.1187191009521484, "rewards/rejected": 0.0, "step": 2317 }, { "epoch": 12.949720670391061, "grad_norm": 0.6007210329834282, "learning_rate": 1.0673379200202831e-07, "logits/chosen": -3.512960195541382, "logits/rejected": -3.60532283782959, "logps/chosen": -0.7710492610931396, "logps/rejected": -56.94520568847656, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 2.1803629398345947, "rewards/margins": 2.1803629398345947, "rewards/rejected": 0.0, "step": 2318 }, { "epoch": 12.955307262569832, "grad_norm": 0.7332608521876424, "learning_rate": 1.0635766069898067e-07, "logits/chosen": -3.2716264724731445, "logits/rejected": -3.2488861083984375, "logps/chosen": -1.935168981552124, "logps/rejected": -39.56070327758789, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 2.5785341262817383, "rewards/margins": 2.5785341262817383, "rewards/rejected": 0.0, "step": 2319 }, { "epoch": 12.960893854748603, "grad_norm": 0.8285272950785425, "learning_rate": 1.0598211441855309e-07, "logits/chosen": -3.2604289054870605, "logits/rejected": -3.347188949584961, "logps/chosen": -0.8810999989509583, "logps/rejected": -73.14479064941406, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 2.319912910461426, "rewards/margins": 2.319912910461426, "rewards/rejected": 0.0, "step": 2320 }, { "epoch": 12.960893854748603, "eval_logits/chosen": -3.2581944465637207, "eval_logits/rejected": -3.3839733600616455, "eval_logps/chosen": -30.50138282775879, "eval_logps/rejected": -55.073570251464844, "eval_loss": 1.0289722681045532, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.04313389211893082, "eval_rewards/margins": -0.04313389211893082, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6713, "eval_samples_per_second": 9.488, "eval_steps_per_second": 0.306, "step": 2320 }, { "epoch": 12.966480446927374, "grad_norm": 0.5036770849735132, "learning_rate": 1.0560715371887424e-07, "logits/chosen": -3.299856185913086, "logits/rejected": -3.561100482940674, "logps/chosen": -0.48619604110717773, "logps/rejected": -47.678043365478516, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 2.1039106845855713, "rewards/margins": 2.1039106845855713, "rewards/rejected": 0.0, "step": 2321 }, { "epoch": 12.972067039106145, "grad_norm": 0.4041259081197625, "learning_rate": 1.0523277915720252e-07, "logits/chosen": -3.390162229537964, "logits/rejected": -3.370681047439575, "logps/chosen": -0.8836253881454468, "logps/rejected": -79.54744720458984, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 2.232394218444824, "rewards/margins": 2.232394218444824, "rewards/rejected": 0.0, "step": 2322 }, { "epoch": 12.977653631284916, "grad_norm": 0.4098458996762994, "learning_rate": 1.0485899128992498e-07, "logits/chosen": -3.3942453861236572, "logits/rejected": -3.4706342220306396, "logps/chosen": -21.32162857055664, "logps/rejected": -35.426292419433594, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 4.139532089233398, "rewards/margins": 4.139532089233398, "rewards/rejected": 0.0, "step": 2323 }, { "epoch": 12.983240223463687, "grad_norm": 0.5340033200407157, "learning_rate": 1.0448579067255747e-07, "logits/chosen": -3.272456407546997, "logits/rejected": -3.3082878589630127, "logps/chosen": -6.20727014541626, "logps/rejected": -62.77374267578125, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": 3.2633891105651855, "rewards/margins": 3.2633891105651855, "rewards/rejected": 0.0, "step": 2324 }, { "epoch": 12.988826815642458, "grad_norm": 0.45993193104965063, "learning_rate": 1.0411317785974194e-07, "logits/chosen": -3.577439785003662, "logits/rejected": -3.5963082313537598, "logps/chosen": -2.768301486968994, "logps/rejected": -51.79939270019531, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 2.9437942504882812, "rewards/margins": 2.9437942504882812, "rewards/rejected": 0.0, "step": 2325 }, { "epoch": 12.994413407821229, "grad_norm": 0.44211725643679217, "learning_rate": 1.0374115340524787e-07, "logits/chosen": -3.2956740856170654, "logits/rejected": -3.453584671020508, "logps/chosen": -2.895425796508789, "logps/rejected": -37.691078186035156, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 3.1172409057617188, "rewards/margins": 3.1172409057617188, "rewards/rejected": 0.0, "step": 2326 }, { "epoch": 13.0, "grad_norm": 0.3660531302685357, "learning_rate": 1.0336971786196963e-07, "logits/chosen": -3.4428699016571045, "logits/rejected": -3.473644733428955, "logps/chosen": -0.23429515957832336, "logps/rejected": -100.61065673828125, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 1.6497079133987427, "rewards/margins": 1.6497079133987427, "rewards/rejected": 0.0, "step": 2327 }, { "epoch": 13.005586592178771, "grad_norm": 0.6408412166809896, "learning_rate": 1.0299887178192668e-07, "logits/chosen": -3.5169355869293213, "logits/rejected": -3.649749755859375, "logps/chosen": -0.6415660977363586, "logps/rejected": -124.60030364990234, "loss": 0.1416, "rewards/accuracies": 1.0, "rewards/chosen": 2.137908458709717, "rewards/margins": 2.137908458709717, "rewards/rejected": 0.0, "step": 2328 }, { "epoch": 13.011173184357542, "grad_norm": 0.4911671107424767, "learning_rate": 1.026286157162623e-07, "logits/chosen": -3.168640375137329, "logits/rejected": -3.291517496109009, "logps/chosen": -1.6391764879226685, "logps/rejected": -44.04465866088867, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 2.718446969985962, "rewards/margins": 2.718446969985962, "rewards/rejected": 0.0, "step": 2329 }, { "epoch": 13.016759776536313, "grad_norm": 0.36539337437173375, "learning_rate": 1.0225895021524288e-07, "logits/chosen": -3.586638927459717, "logits/rejected": -3.3814616203308105, "logps/chosen": -1.656554102897644, "logps/rejected": -53.50571060180664, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 3.1035714149475098, "rewards/margins": 3.1035714149475098, "rewards/rejected": 0.0, "step": 2330 }, { "epoch": 13.022346368715084, "grad_norm": 0.41900180730220143, "learning_rate": 1.0188987582825731e-07, "logits/chosen": -3.539759635925293, "logits/rejected": -3.4876558780670166, "logps/chosen": -0.5386089086532593, "logps/rejected": -53.84013366699219, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 2.0928072929382324, "rewards/margins": 2.0928072929382324, "rewards/rejected": 0.0, "step": 2331 }, { "epoch": 13.027932960893855, "grad_norm": 0.41260798214807526, "learning_rate": 1.0152139310381564e-07, "logits/chosen": -3.0246479511260986, "logits/rejected": -3.1097795963287354, "logps/chosen": -16.28483772277832, "logps/rejected": -50.55168914794922, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 3.687743663787842, "rewards/margins": 3.687743663787842, "rewards/rejected": 0.0, "step": 2332 }, { "epoch": 13.033519553072626, "grad_norm": 0.40657110392164086, "learning_rate": 1.0115350258954924e-07, "logits/chosen": -3.4643912315368652, "logits/rejected": -3.541138172149658, "logps/chosen": -6.377556800842285, "logps/rejected": -50.0582389831543, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 3.3987014293670654, "rewards/margins": 3.3987014293670654, "rewards/rejected": 0.0, "step": 2333 }, { "epoch": 13.039106145251397, "grad_norm": 0.5654692336015206, "learning_rate": 1.007862048322084e-07, "logits/chosen": -3.3608760833740234, "logits/rejected": -3.413088321685791, "logps/chosen": -0.3426097333431244, "logps/rejected": -43.96967315673828, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 1.8785548210144043, "rewards/margins": 1.8785548210144043, "rewards/rejected": 0.0, "step": 2334 }, { "epoch": 13.044692737430168, "grad_norm": 1.1834090094153376, "learning_rate": 1.004195003776635e-07, "logits/chosen": -3.2762203216552734, "logits/rejected": -3.2310073375701904, "logps/chosen": -21.713489532470703, "logps/rejected": -64.16056823730469, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 2.722824811935425, "rewards/margins": 2.722824811935425, "rewards/rejected": 0.0, "step": 2335 }, { "epoch": 13.050279329608939, "grad_norm": 0.37757238915543967, "learning_rate": 1.0005338977090222e-07, "logits/chosen": -3.4060440063476562, "logits/rejected": -3.3707094192504883, "logps/chosen": -0.9788174629211426, "logps/rejected": -70.06330108642578, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3599703311920166, "rewards/margins": 2.3599703311920166, "rewards/rejected": 0.0, "step": 2336 }, { "epoch": 13.05586592178771, "grad_norm": 0.8538934050291332, "learning_rate": 9.968787355603043e-08, "logits/chosen": -3.287316083908081, "logits/rejected": -3.288396120071411, "logps/chosen": -2.2251811027526855, "logps/rejected": -25.12554168701172, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 3.1232540607452393, "rewards/margins": 3.1232540607452393, "rewards/rejected": 0.0, "step": 2337 }, { "epoch": 13.061452513966481, "grad_norm": 0.41511079809678714, "learning_rate": 9.932295227627024e-08, "logits/chosen": -3.3394670486450195, "logits/rejected": -3.3566854000091553, "logps/chosen": -6.414815902709961, "logps/rejected": -53.56317901611328, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": 3.6157889366149902, "rewards/margins": 3.6157889366149902, "rewards/rejected": 0.0, "step": 2338 }, { "epoch": 13.067039106145252, "grad_norm": 0.34623811350305467, "learning_rate": 9.895862647395964e-08, "logits/chosen": -3.355541467666626, "logits/rejected": -3.5320823192596436, "logps/chosen": -8.17490005493164, "logps/rejected": -104.54173278808594, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 3.5288174152374268, "rewards/margins": 3.5288174152374268, "rewards/rejected": 0.0, "step": 2339 }, { "epoch": 13.072625698324023, "grad_norm": 0.4178735197864293, "learning_rate": 9.859489669055165e-08, "logits/chosen": -2.971041440963745, "logits/rejected": -3.082794666290283, "logps/chosen": -3.396512269973755, "logps/rejected": -40.988922119140625, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 2.658754587173462, "rewards/margins": 2.658754587173462, "rewards/rejected": 0.0, "step": 2340 }, { "epoch": 13.072625698324023, "eval_logits/chosen": -3.2651684284210205, "eval_logits/rejected": -3.3902339935302734, "eval_logps/chosen": -30.2309513092041, "eval_logps/rejected": -54.971168518066406, "eval_loss": 1.0176070928573608, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.01609114371240139, "eval_rewards/margins": -0.01609114371240139, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6447, "eval_samples_per_second": 9.496, "eval_steps_per_second": 0.306, "step": 2340 }, { "epoch": 13.078212290502794, "grad_norm": 0.40969094850763416, "learning_rate": 9.823176346661355e-08, "logits/chosen": -3.2507095336914062, "logits/rejected": -3.461381673812866, "logps/chosen": -5.09861421585083, "logps/rejected": -50.253456115722656, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 2.959385395050049, "rewards/margins": 2.959385395050049, "rewards/rejected": 0.0, "step": 2341 }, { "epoch": 13.083798882681565, "grad_norm": 0.39790536006023935, "learning_rate": 9.786922734182596e-08, "logits/chosen": -3.341721534729004, "logits/rejected": -3.2856099605560303, "logps/chosen": -3.377207040786743, "logps/rejected": -36.50450897216797, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 3.119828224182129, "rewards/margins": 3.119828224182129, "rewards/rejected": 0.0, "step": 2342 }, { "epoch": 13.089385474860336, "grad_norm": 0.44848991439820296, "learning_rate": 9.750728885498194e-08, "logits/chosen": -3.4456944465637207, "logits/rejected": -3.5190508365631104, "logps/chosen": -20.230289459228516, "logps/rejected": -60.31085968017578, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 3.820690393447876, "rewards/margins": 3.820690393447876, "rewards/rejected": 0.0, "step": 2343 }, { "epoch": 13.094972067039107, "grad_norm": 0.4169931954396705, "learning_rate": 9.714594854398695e-08, "logits/chosen": -3.7361228466033936, "logits/rejected": -3.552189588546753, "logps/chosen": -0.8857628107070923, "logps/rejected": -43.35779571533203, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 2.4181251525878906, "rewards/margins": 2.4181251525878906, "rewards/rejected": 0.0, "step": 2344 }, { "epoch": 13.100558659217878, "grad_norm": 0.3927370456785596, "learning_rate": 9.678520694585662e-08, "logits/chosen": -3.4572887420654297, "logits/rejected": -3.352020502090454, "logps/chosen": -17.534107208251953, "logps/rejected": -52.65359115600586, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 3.997272491455078, "rewards/margins": 3.997272491455078, "rewards/rejected": 0.0, "step": 2345 }, { "epoch": 13.106145251396647, "grad_norm": 0.46886156405210677, "learning_rate": 9.642506459671745e-08, "logits/chosen": -3.3222928047180176, "logits/rejected": -3.3929309844970703, "logps/chosen": -3.5490429401397705, "logps/rejected": -98.44245147705078, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 2.640000581741333, "rewards/margins": 2.640000581741333, "rewards/rejected": 0.0, "step": 2346 }, { "epoch": 13.111731843575418, "grad_norm": 0.4245962211325615, "learning_rate": 9.606552203180513e-08, "logits/chosen": -3.337649345397949, "logits/rejected": -3.1500914096832275, "logps/chosen": -14.195104598999023, "logps/rejected": -40.21344757080078, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 3.5140798091888428, "rewards/margins": 3.5140798091888428, "rewards/rejected": 0.0, "step": 2347 }, { "epoch": 13.11731843575419, "grad_norm": 0.5806114984389156, "learning_rate": 9.570657978546381e-08, "logits/chosen": -3.372487783432007, "logits/rejected": -3.514693021774292, "logps/chosen": -0.7192268371582031, "logps/rejected": -105.02080535888672, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 2.049988269805908, "rewards/margins": 2.049988269805908, "rewards/rejected": 0.0, "step": 2348 }, { "epoch": 13.12290502793296, "grad_norm": 0.39375042646087816, "learning_rate": 9.534823839114553e-08, "logits/chosen": -3.4985358715057373, "logits/rejected": -3.4309816360473633, "logps/chosen": -19.601560592651367, "logps/rejected": -43.45613098144531, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": 3.677501678466797, "rewards/margins": 3.677501678466797, "rewards/rejected": 0.0, "step": 2349 }, { "epoch": 13.128491620111731, "grad_norm": 0.4178634388752394, "learning_rate": 9.499049838140982e-08, "logits/chosen": -3.300751209259033, "logits/rejected": -3.357069969177246, "logps/chosen": -3.3639509677886963, "logps/rejected": -34.51044464111328, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 3.083294153213501, "rewards/margins": 3.083294153213501, "rewards/rejected": 0.0, "step": 2350 }, { "epoch": 13.134078212290502, "grad_norm": 0.33561040614202914, "learning_rate": 9.463336028792157e-08, "logits/chosen": -3.408461570739746, "logits/rejected": -3.563394069671631, "logps/chosen": -2.475998878479004, "logps/rejected": -43.663578033447266, "loss": 0.1234, "rewards/accuracies": 1.0, "rewards/chosen": 3.060432195663452, "rewards/margins": 3.060432195663452, "rewards/rejected": 0.0, "step": 2351 }, { "epoch": 13.139664804469273, "grad_norm": 0.3351565583774232, "learning_rate": 9.42768246414517e-08, "logits/chosen": -3.5358974933624268, "logits/rejected": -3.603031635284424, "logps/chosen": -1.1546649932861328, "logps/rejected": -43.19342803955078, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 2.2943787574768066, "rewards/margins": 2.2943787574768066, "rewards/rejected": 0.0, "step": 2352 }, { "epoch": 13.145251396648044, "grad_norm": 0.4063516832940742, "learning_rate": 9.392089197187602e-08, "logits/chosen": -3.252317428588867, "logits/rejected": -3.304638624191284, "logps/chosen": -0.6251400709152222, "logps/rejected": -60.52002716064453, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.2664456367492676, "rewards/margins": 2.2664456367492676, "rewards/rejected": 0.0, "step": 2353 }, { "epoch": 13.150837988826815, "grad_norm": 0.7339427043590783, "learning_rate": 9.356556280817334e-08, "logits/chosen": -3.2447080612182617, "logits/rejected": -3.2067127227783203, "logps/chosen": -0.7351337671279907, "logps/rejected": -80.97441864013672, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 2.164402723312378, "rewards/margins": 2.164402723312378, "rewards/rejected": 0.0, "step": 2354 }, { "epoch": 13.156424581005586, "grad_norm": 0.3783231588225719, "learning_rate": 9.321083767842646e-08, "logits/chosen": -3.2772772312164307, "logits/rejected": -3.340531587600708, "logps/chosen": -4.022555351257324, "logps/rejected": -37.15563201904297, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 3.373973846435547, "rewards/margins": 3.373973846435547, "rewards/rejected": 0.0, "step": 2355 }, { "epoch": 13.162011173184357, "grad_norm": 0.5567838268088865, "learning_rate": 9.285671710981995e-08, "logits/chosen": -3.241022825241089, "logits/rejected": -3.380427598953247, "logps/chosen": -1.5501210689544678, "logps/rejected": -42.20450210571289, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 2.869105339050293, "rewards/margins": 2.869105339050293, "rewards/rejected": 0.0, "step": 2356 }, { "epoch": 13.167597765363128, "grad_norm": 0.41097528962627683, "learning_rate": 9.250320162864012e-08, "logits/chosen": -3.313119888305664, "logits/rejected": -3.4442129135131836, "logps/chosen": -5.751883029937744, "logps/rejected": -36.29100799560547, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 2.2788279056549072, "rewards/margins": 2.2788279056549072, "rewards/rejected": 0.0, "step": 2357 }, { "epoch": 13.1731843575419, "grad_norm": 0.38552319656934547, "learning_rate": 9.215029176027373e-08, "logits/chosen": -3.304882049560547, "logits/rejected": -3.3625175952911377, "logps/chosen": -1.073160171508789, "logps/rejected": -75.98919677734375, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": 2.6493020057678223, "rewards/margins": 2.6493020057678223, "rewards/rejected": 0.0, "step": 2358 }, { "epoch": 13.17877094972067, "grad_norm": 0.4618925236399504, "learning_rate": 9.179798802920813e-08, "logits/chosen": -3.4035518169403076, "logits/rejected": -3.3589015007019043, "logps/chosen": -6.312320709228516, "logps/rejected": -42.05010223388672, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.631145715713501, "rewards/margins": 3.631145715713501, "rewards/rejected": 0.0, "step": 2359 }, { "epoch": 13.184357541899441, "grad_norm": 0.3400826453101628, "learning_rate": 9.144629095902894e-08, "logits/chosen": -3.422144889831543, "logits/rejected": -3.3887534141540527, "logps/chosen": -18.418508529663086, "logps/rejected": -41.16917037963867, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 3.146446466445923, "rewards/margins": 3.146446466445923, "rewards/rejected": 0.0, "step": 2360 }, { "epoch": 13.184357541899441, "eval_logits/chosen": -3.256208896636963, "eval_logits/rejected": -3.3817107677459717, "eval_logps/chosen": -30.51275634765625, "eval_logps/rejected": -55.494361877441406, "eval_loss": 1.0322561264038086, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.04427126422524452, "eval_rewards/margins": -0.04427126422524452, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6781, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.306, "step": 2360 }, { "epoch": 13.189944134078212, "grad_norm": 1.002263887913487, "learning_rate": 9.109520107242069e-08, "logits/chosen": -3.4199793338775635, "logits/rejected": -3.5535426139831543, "logps/chosen": -3.609666347503662, "logps/rejected": -48.57051086425781, "loss": 0.0818, "rewards/accuracies": 1.0, "rewards/chosen": 3.0819921493530273, "rewards/margins": 3.0819921493530273, "rewards/rejected": 0.0, "step": 2361 }, { "epoch": 13.195530726256983, "grad_norm": 0.340160470860633, "learning_rate": 9.074471889116591e-08, "logits/chosen": -3.2756481170654297, "logits/rejected": -3.430361270904541, "logps/chosen": -0.9844825267791748, "logps/rejected": -47.28947830200195, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": 2.6441168785095215, "rewards/margins": 2.6441168785095215, "rewards/rejected": 0.0, "step": 2362 }, { "epoch": 13.201117318435754, "grad_norm": 0.4376289377535241, "learning_rate": 9.039484493614297e-08, "logits/chosen": -3.513704299926758, "logits/rejected": -3.4845776557922363, "logps/chosen": -15.548395156860352, "logps/rejected": -56.104347229003906, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": 3.270206928253174, "rewards/margins": 3.270206928253174, "rewards/rejected": 0.0, "step": 2363 }, { "epoch": 13.206703910614525, "grad_norm": 0.4568918698457708, "learning_rate": 9.00455797273274e-08, "logits/chosen": -3.138018846511841, "logits/rejected": -3.2717790603637695, "logps/chosen": -1.6022212505340576, "logps/rejected": -51.163604736328125, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 2.738399028778076, "rewards/margins": 2.738399028778076, "rewards/rejected": 0.0, "step": 2364 }, { "epoch": 13.212290502793296, "grad_norm": 0.40291158651426134, "learning_rate": 8.969692378378895e-08, "logits/chosen": -3.2863690853118896, "logits/rejected": -3.287386655807495, "logps/chosen": -25.854572296142578, "logps/rejected": -42.86607360839844, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 4.345571994781494, "rewards/margins": 4.345571994781494, "rewards/rejected": 0.0, "step": 2365 }, { "epoch": 13.217877094972067, "grad_norm": 0.42250807313026095, "learning_rate": 8.934887762369292e-08, "logits/chosen": -3.4446041584014893, "logits/rejected": -3.593846321105957, "logps/chosen": -0.8210120797157288, "logps/rejected": -45.549713134765625, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.494363307952881, "rewards/margins": 2.494363307952881, "rewards/rejected": 0.0, "step": 2366 }, { "epoch": 13.223463687150838, "grad_norm": 0.4410715974925357, "learning_rate": 8.900144176429763e-08, "logits/chosen": -3.172802448272705, "logits/rejected": -3.222987174987793, "logps/chosen": -6.237480640411377, "logps/rejected": -56.25542449951172, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.55985689163208, "rewards/margins": 2.55985689163208, "rewards/rejected": 0.0, "step": 2367 }, { "epoch": 13.22905027932961, "grad_norm": 1.9814781233024836, "learning_rate": 8.865461672195479e-08, "logits/chosen": -3.2542126178741455, "logits/rejected": -3.2465641498565674, "logps/chosen": -8.509102821350098, "logps/rejected": -52.00053787231445, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 3.2179455757141113, "rewards/margins": 3.2179455757141113, "rewards/rejected": 0.0, "step": 2368 }, { "epoch": 13.23463687150838, "grad_norm": 0.5108016115370785, "learning_rate": 8.830840301210796e-08, "logits/chosen": -3.230532169342041, "logits/rejected": -3.542701244354248, "logps/chosen": -1.4257872104644775, "logps/rejected": -74.46788024902344, "loss": 0.1489, "rewards/accuracies": 1.0, "rewards/chosen": 2.5936241149902344, "rewards/margins": 2.5936241149902344, "rewards/rejected": 0.0, "step": 2369 }, { "epoch": 13.240223463687151, "grad_norm": 0.43404165553903007, "learning_rate": 8.796280114929272e-08, "logits/chosen": -3.2910354137420654, "logits/rejected": -3.26667857170105, "logps/chosen": -5.609430313110352, "logps/rejected": -37.08899688720703, "loss": 0.1039, "rewards/accuracies": 1.0, "rewards/chosen": 3.2219905853271484, "rewards/margins": 3.2219905853271484, "rewards/rejected": 0.0, "step": 2370 }, { "epoch": 13.245810055865922, "grad_norm": 0.4531421096419252, "learning_rate": 8.761781164713472e-08, "logits/chosen": -3.318911075592041, "logits/rejected": -3.3356807231903076, "logps/chosen": -7.822868824005127, "logps/rejected": -53.15188980102539, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 2.9234256744384766, "rewards/margins": 2.9234256744384766, "rewards/rejected": 0.0, "step": 2371 }, { "epoch": 13.251396648044693, "grad_norm": 0.4393544810482372, "learning_rate": 8.727343501834972e-08, "logits/chosen": -3.2168662548065186, "logits/rejected": -3.3560662269592285, "logps/chosen": -25.274131774902344, "logps/rejected": -55.020294189453125, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 3.256657123565674, "rewards/margins": 3.256657123565674, "rewards/rejected": 0.0, "step": 2372 }, { "epoch": 13.256983240223464, "grad_norm": 0.3822046822850603, "learning_rate": 8.692967177474309e-08, "logits/chosen": -3.4313976764678955, "logits/rejected": -3.4952926635742188, "logps/chosen": -6.944964408874512, "logps/rejected": -48.05303192138672, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 4.522512912750244, "rewards/margins": 4.522512912750244, "rewards/rejected": 0.0, "step": 2373 }, { "epoch": 13.262569832402235, "grad_norm": 1.780335975045036, "learning_rate": 8.658652242720777e-08, "logits/chosen": -2.8541641235351562, "logits/rejected": -3.011066198348999, "logps/chosen": -3.7367441654205322, "logps/rejected": -40.481956481933594, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 2.9329137802124023, "rewards/margins": 2.9329137802124023, "rewards/rejected": 0.0, "step": 2374 }, { "epoch": 13.268156424581006, "grad_norm": 0.615246519240483, "learning_rate": 8.62439874857252e-08, "logits/chosen": -3.517827033996582, "logits/rejected": -3.4203219413757324, "logps/chosen": -4.895214080810547, "logps/rejected": -49.74782180786133, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 2.790727138519287, "rewards/margins": 2.790727138519287, "rewards/rejected": 0.0, "step": 2375 }, { "epoch": 13.273743016759777, "grad_norm": 0.3921191278380291, "learning_rate": 8.590206745936307e-08, "logits/chosen": -3.570204734802246, "logits/rejected": -3.51688551902771, "logps/chosen": -5.914793491363525, "logps/rejected": -104.97129821777344, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 3.8447258472442627, "rewards/margins": 3.8447258472442627, "rewards/rejected": 0.0, "step": 2376 }, { "epoch": 13.279329608938548, "grad_norm": 0.8084549264345066, "learning_rate": 8.556076285627556e-08, "logits/chosen": -2.666715145111084, "logits/rejected": -2.7342615127563477, "logps/chosen": -8.738677978515625, "logps/rejected": -100.0520248413086, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 2.3125147819519043, "rewards/margins": 2.3125147819519043, "rewards/rejected": 0.0, "step": 2377 }, { "epoch": 13.28491620111732, "grad_norm": 0.3602284916437586, "learning_rate": 8.522007418370187e-08, "logits/chosen": -3.507559061050415, "logits/rejected": -3.5460026264190674, "logps/chosen": -0.5510622262954712, "logps/rejected": -39.30126190185547, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.086501359939575, "rewards/margins": 2.086501359939575, "rewards/rejected": 0.0, "step": 2378 }, { "epoch": 13.29050279329609, "grad_norm": 0.449789327678422, "learning_rate": 8.488000194796646e-08, "logits/chosen": -3.4018757343292236, "logits/rejected": -3.5999956130981445, "logps/chosen": -2.413813352584839, "logps/rejected": -36.636959075927734, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 3.452755928039551, "rewards/margins": 3.452755928039551, "rewards/rejected": 0.0, "step": 2379 }, { "epoch": 13.296089385474861, "grad_norm": 1.131102020935119, "learning_rate": 8.45405466544768e-08, "logits/chosen": -3.4479286670684814, "logits/rejected": -3.5594661235809326, "logps/chosen": -2.1537208557128906, "logps/rejected": -59.7096061706543, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 2.1846818923950195, "rewards/margins": 2.1846818923950195, "rewards/rejected": 0.0, "step": 2380 }, { "epoch": 13.296089385474861, "eval_logits/chosen": -3.2597923278808594, "eval_logits/rejected": -3.3839545249938965, "eval_logps/chosen": -30.492467880249023, "eval_logps/rejected": -55.89141082763672, "eval_loss": 1.0290724039077759, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.04224271699786186, "eval_rewards/margins": -0.04224271699786186, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7095, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 2380 }, { "epoch": 13.30167597765363, "grad_norm": 0.4456367654266066, "learning_rate": 8.420170880772415e-08, "logits/chosen": -3.3668711185455322, "logits/rejected": -3.6564130783081055, "logps/chosen": -0.4650506377220154, "logps/rejected": -63.997406005859375, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 2.1184096336364746, "rewards/margins": 2.1184096336364746, "rewards/rejected": 0.0, "step": 2381 }, { "epoch": 13.307262569832401, "grad_norm": 0.36569527767192056, "learning_rate": 8.386348891128198e-08, "logits/chosen": -3.3553197383880615, "logits/rejected": -3.3262808322906494, "logps/chosen": -0.3164428770542145, "logps/rejected": -34.770713806152344, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 1.7639267444610596, "rewards/margins": 1.7639267444610596, "rewards/rejected": 0.0, "step": 2382 }, { "epoch": 13.312849162011172, "grad_norm": 0.40184913178279863, "learning_rate": 8.352588746780487e-08, "logits/chosen": -3.364434003829956, "logits/rejected": -3.323281764984131, "logps/chosen": -5.599338531494141, "logps/rejected": -35.86624526977539, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": 3.009035587310791, "rewards/margins": 3.009035587310791, "rewards/rejected": 0.0, "step": 2383 }, { "epoch": 13.318435754189943, "grad_norm": 0.3866840029712693, "learning_rate": 8.318890497902914e-08, "logits/chosen": -3.225160837173462, "logits/rejected": -3.2643654346466064, "logps/chosen": -6.571699142456055, "logps/rejected": -31.512510299682617, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 3.1004815101623535, "rewards/margins": 3.1004815101623535, "rewards/rejected": 0.0, "step": 2384 }, { "epoch": 13.324022346368714, "grad_norm": 0.41761581252563856, "learning_rate": 8.285254194577057e-08, "logits/chosen": -3.3451530933380127, "logits/rejected": -3.414379358291626, "logps/chosen": -0.5896964073181152, "logps/rejected": -47.427093505859375, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 2.404115676879883, "rewards/margins": 2.404115676879883, "rewards/rejected": 0.0, "step": 2385 }, { "epoch": 13.329608938547485, "grad_norm": 0.45632078737894827, "learning_rate": 8.251679886792457e-08, "logits/chosen": -3.568117380142212, "logits/rejected": -3.6124401092529297, "logps/chosen": -7.401118278503418, "logps/rejected": -71.91776275634766, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 3.2571563720703125, "rewards/margins": 3.2571563720703125, "rewards/rejected": 0.0, "step": 2386 }, { "epoch": 13.335195530726256, "grad_norm": 0.5625236710883265, "learning_rate": 8.218167624446504e-08, "logits/chosen": -3.2756669521331787, "logits/rejected": -3.131768226623535, "logps/chosen": -0.7099722623825073, "logps/rejected": -32.888465881347656, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.3063101768493652, "rewards/margins": 2.3063101768493652, "rewards/rejected": 0.0, "step": 2387 }, { "epoch": 13.340782122905027, "grad_norm": 0.7671117523165139, "learning_rate": 8.18471745734442e-08, "logits/chosen": -3.6030690670013428, "logits/rejected": -3.4550702571868896, "logps/chosen": -0.3922824263572693, "logps/rejected": -50.66614532470703, "loss": 0.1121, "rewards/accuracies": 1.0, "rewards/chosen": 2.2038583755493164, "rewards/margins": 2.2038583755493164, "rewards/rejected": 0.0, "step": 2388 }, { "epoch": 13.346368715083798, "grad_norm": 0.4001003233855212, "learning_rate": 8.151329435199067e-08, "logits/chosen": -3.3189687728881836, "logits/rejected": -3.348407745361328, "logps/chosen": -0.6709566116333008, "logps/rejected": -64.72965240478516, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.204169273376465, "rewards/margins": 2.204169273376465, "rewards/rejected": 0.0, "step": 2389 }, { "epoch": 13.35195530726257, "grad_norm": 0.3865552716765946, "learning_rate": 8.118003607631024e-08, "logits/chosen": -3.5786283016204834, "logits/rejected": -3.579033374786377, "logps/chosen": -2.694607734680176, "logps/rejected": -81.97444152832031, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 2.57879900932312, "rewards/margins": 2.57879900932312, "rewards/rejected": 0.0, "step": 2390 }, { "epoch": 13.35754189944134, "grad_norm": 0.48414906708529765, "learning_rate": 8.084740024168407e-08, "logits/chosen": -3.3128931522369385, "logits/rejected": -3.3622987270355225, "logps/chosen": -0.8101568222045898, "logps/rejected": -55.46433639526367, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 2.5311551094055176, "rewards/margins": 2.5311551094055176, "rewards/rejected": 0.0, "step": 2391 }, { "epoch": 13.363128491620111, "grad_norm": 0.4407183387585617, "learning_rate": 8.051538734246793e-08, "logits/chosen": -3.258774995803833, "logits/rejected": -3.069319486618042, "logps/chosen": -2.305711030960083, "logps/rejected": -43.63422393798828, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 3.1751363277435303, "rewards/margins": 3.1751363277435303, "rewards/rejected": 0.0, "step": 2392 }, { "epoch": 13.368715083798882, "grad_norm": 0.40758371727240833, "learning_rate": 8.018399787209256e-08, "logits/chosen": -3.353487491607666, "logits/rejected": -3.504014492034912, "logps/chosen": -0.7513201236724854, "logps/rejected": -46.85102081298828, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 2.2565627098083496, "rewards/margins": 2.2565627098083496, "rewards/rejected": 0.0, "step": 2393 }, { "epoch": 13.374301675977653, "grad_norm": 1.5687597986504542, "learning_rate": 7.985323232306124e-08, "logits/chosen": -3.660555124282837, "logits/rejected": -3.657735586166382, "logps/chosen": -5.776373386383057, "logps/rejected": -43.98847961425781, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.4119997024536133, "rewards/margins": 3.4119997024536133, "rewards/rejected": 0.0, "step": 2394 }, { "epoch": 13.379888268156424, "grad_norm": 0.3220517556472039, "learning_rate": 7.952309118695083e-08, "logits/chosen": -3.4327592849731445, "logits/rejected": -3.3201828002929688, "logps/chosen": -4.240723133087158, "logps/rejected": -66.45988464355469, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": 3.1899328231811523, "rewards/margins": 3.1899328231811523, "rewards/rejected": 0.0, "step": 2395 }, { "epoch": 13.385474860335195, "grad_norm": 0.35886696892881076, "learning_rate": 7.919357495440976e-08, "logits/chosen": -3.4310574531555176, "logits/rejected": -3.372286558151245, "logps/chosen": -5.797863483428955, "logps/rejected": -46.23900604248047, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.9535272121429443, "rewards/margins": 3.9535272121429443, "rewards/rejected": 0.0, "step": 2396 }, { "epoch": 13.391061452513966, "grad_norm": 0.396275412424554, "learning_rate": 7.886468411515784e-08, "logits/chosen": -3.400118350982666, "logits/rejected": -3.1456902027130127, "logps/chosen": -8.911643028259277, "logps/rejected": -44.610939025878906, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 3.2362236976623535, "rewards/margins": 3.2362236976623535, "rewards/rejected": 0.0, "step": 2397 }, { "epoch": 13.396648044692737, "grad_norm": 0.4708518096849073, "learning_rate": 7.853641915798531e-08, "logits/chosen": -2.7484145164489746, "logits/rejected": -2.6871142387390137, "logps/chosen": -4.712937831878662, "logps/rejected": -31.69449806213379, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 2.9272632598876953, "rewards/margins": 2.9272632598876953, "rewards/rejected": 0.0, "step": 2398 }, { "epoch": 13.402234636871508, "grad_norm": 0.38999874305736726, "learning_rate": 7.820878057075275e-08, "logits/chosen": -3.495323419570923, "logits/rejected": -3.4236326217651367, "logps/chosen": -6.782710552215576, "logps/rejected": -52.033782958984375, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.9194343090057373, "rewards/margins": 2.9194343090057373, "rewards/rejected": 0.0, "step": 2399 }, { "epoch": 13.40782122905028, "grad_norm": 0.41039560712904977, "learning_rate": 7.788176884038889e-08, "logits/chosen": -3.5283381938934326, "logits/rejected": -3.547929048538208, "logps/chosen": -0.29387354850769043, "logps/rejected": -49.664527893066406, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 1.6123600006103516, "rewards/margins": 1.6123600006103516, "rewards/rejected": 0.0, "step": 2400 }, { "epoch": 13.40782122905028, "eval_logits/chosen": -3.258481502532959, "eval_logits/rejected": -3.381990909576416, "eval_logps/chosen": -30.63241195678711, "eval_logps/rejected": -56.020240783691406, "eval_loss": 1.0348480939865112, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.056236814707517624, "eval_rewards/margins": -0.056236814707517624, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7249, "eval_samples_per_second": 9.473, "eval_steps_per_second": 0.306, "step": 2400 }, { "epoch": 13.41340782122905, "grad_norm": 1.2377268202018599, "learning_rate": 7.755538445289179e-08, "logits/chosen": -3.432081699371338, "logits/rejected": -3.4291303157806396, "logps/chosen": -5.873993396759033, "logps/rejected": -46.123741149902344, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 3.1080503463745117, "rewards/margins": 3.1080503463745117, "rewards/rejected": 0.0, "step": 2401 }, { "epoch": 13.418994413407821, "grad_norm": 0.40422115430184585, "learning_rate": 7.722962789332676e-08, "logits/chosen": -3.4764888286590576, "logits/rejected": -3.468198776245117, "logps/chosen": -5.431130886077881, "logps/rejected": -34.02408218383789, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 3.5071778297424316, "rewards/margins": 3.5071778297424316, "rewards/rejected": 0.0, "step": 2402 }, { "epoch": 13.424581005586592, "grad_norm": 0.41757046049489394, "learning_rate": 7.69044996458258e-08, "logits/chosen": -3.4845077991485596, "logits/rejected": -3.4692227840423584, "logps/chosen": -0.43687373399734497, "logps/rejected": -37.74633026123047, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": 2.040178060531616, "rewards/margins": 2.040178060531616, "rewards/rejected": 0.0, "step": 2403 }, { "epoch": 13.430167597765363, "grad_norm": 1.1194557115758397, "learning_rate": 7.658000019358762e-08, "logits/chosen": -3.3184359073638916, "logits/rejected": -3.2005765438079834, "logps/chosen": -0.553019642829895, "logps/rejected": -111.14012145996094, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 1.7884210348129272, "rewards/margins": 1.7884210348129272, "rewards/rejected": 0.0, "step": 2404 }, { "epoch": 13.435754189944134, "grad_norm": 0.4788821439759304, "learning_rate": 7.62561300188761e-08, "logits/chosen": -3.4542956352233887, "logits/rejected": -3.4098803997039795, "logps/chosen": -4.152714729309082, "logps/rejected": -39.71040344238281, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 3.5099453926086426, "rewards/margins": 3.5099453926086426, "rewards/rejected": 0.0, "step": 2405 }, { "epoch": 13.441340782122905, "grad_norm": 0.49335687149670227, "learning_rate": 7.593288960302008e-08, "logits/chosen": -3.4500067234039307, "logits/rejected": -3.3621268272399902, "logps/chosen": -0.4197743237018585, "logps/rejected": -51.166534423828125, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 2.170515775680542, "rewards/margins": 2.170515775680542, "rewards/rejected": 0.0, "step": 2406 }, { "epoch": 13.446927374301676, "grad_norm": 0.41892482487072363, "learning_rate": 7.561027942641218e-08, "logits/chosen": -3.198925495147705, "logits/rejected": -3.25957989692688, "logps/chosen": -8.140546798706055, "logps/rejected": -102.78670501708984, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": 3.6075353622436523, "rewards/margins": 3.6075353622436523, "rewards/rejected": 0.0, "step": 2407 }, { "epoch": 13.452513966480447, "grad_norm": 0.8317060791614108, "learning_rate": 7.528829996850899e-08, "logits/chosen": -3.583693504333496, "logits/rejected": -3.5818145275115967, "logps/chosen": -14.598260879516602, "logps/rejected": -41.034305572509766, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 4.366270542144775, "rewards/margins": 4.366270542144775, "rewards/rejected": 0.0, "step": 2408 }, { "epoch": 13.458100558659218, "grad_norm": 0.4295777228419252, "learning_rate": 7.496695170782879e-08, "logits/chosen": -3.62851619720459, "logits/rejected": -3.5785186290740967, "logps/chosen": -2.085975408554077, "logps/rejected": -66.775634765625, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.925323963165283, "rewards/margins": 2.925323963165283, "rewards/rejected": 0.0, "step": 2409 }, { "epoch": 13.46368715083799, "grad_norm": 0.37220209217410677, "learning_rate": 7.464623512195278e-08, "logits/chosen": -3.257615327835083, "logits/rejected": -3.296339988708496, "logps/chosen": -0.5343749523162842, "logps/rejected": -43.92362976074219, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.4658780097961426, "rewards/margins": 2.4658780097961426, "rewards/rejected": 0.0, "step": 2410 }, { "epoch": 13.46927374301676, "grad_norm": 0.3584433295653612, "learning_rate": 7.432615068752279e-08, "logits/chosen": -3.3008205890655518, "logits/rejected": -3.3128490447998047, "logps/chosen": -8.472904205322266, "logps/rejected": -41.18153381347656, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 3.400026321411133, "rewards/margins": 3.400026321411133, "rewards/rejected": 0.0, "step": 2411 }, { "epoch": 13.474860335195531, "grad_norm": 0.39790237405570594, "learning_rate": 7.400669888024108e-08, "logits/chosen": -3.236492395401001, "logits/rejected": -3.2329535484313965, "logps/chosen": -7.113299369812012, "logps/rejected": -41.26221466064453, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 2.763033866882324, "rewards/margins": 2.763033866882324, "rewards/rejected": 0.0, "step": 2412 }, { "epoch": 13.480446927374302, "grad_norm": 0.4062685015336655, "learning_rate": 7.368788017487016e-08, "logits/chosen": -3.2081613540649414, "logits/rejected": -3.2045440673828125, "logps/chosen": -1.781062364578247, "logps/rejected": -66.75691223144531, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 2.8163845539093018, "rewards/margins": 2.8163845539093018, "rewards/rejected": 0.0, "step": 2413 }, { "epoch": 13.486033519553073, "grad_norm": 0.3760925307992988, "learning_rate": 7.336969504523133e-08, "logits/chosen": -3.3029186725616455, "logits/rejected": -3.1940293312072754, "logps/chosen": -1.1946035623550415, "logps/rejected": -51.66459655761719, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 2.4233834743499756, "rewards/margins": 2.4233834743499756, "rewards/rejected": 0.0, "step": 2414 }, { "epoch": 13.491620111731844, "grad_norm": 0.38560132937368724, "learning_rate": 7.305214396420439e-08, "logits/chosen": -3.598036527633667, "logits/rejected": -3.5693676471710205, "logps/chosen": -33.32712936401367, "logps/rejected": -30.71826934814453, "loss": 0.0922, "rewards/accuracies": 1.0, "rewards/chosen": 4.256027698516846, "rewards/margins": 4.256027698516846, "rewards/rejected": 0.0, "step": 2415 }, { "epoch": 13.497206703910614, "grad_norm": 0.4257576326590357, "learning_rate": 7.273522740372662e-08, "logits/chosen": -3.414893388748169, "logits/rejected": -3.609196186065674, "logps/chosen": -0.678642749786377, "logps/rejected": -100.84706115722656, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 2.088613510131836, "rewards/margins": 2.088613510131836, "rewards/rejected": 0.0, "step": 2416 }, { "epoch": 13.502793296089386, "grad_norm": 0.352367799804756, "learning_rate": 7.241894583479285e-08, "logits/chosen": -3.357116937637329, "logits/rejected": -3.5004019737243652, "logps/chosen": -1.0748229026794434, "logps/rejected": -55.708335876464844, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": 2.753563404083252, "rewards/margins": 2.753563404083252, "rewards/rejected": 0.0, "step": 2417 }, { "epoch": 13.508379888268156, "grad_norm": 0.46647670703766125, "learning_rate": 7.210329972745343e-08, "logits/chosen": -3.197568893432617, "logits/rejected": -3.4688611030578613, "logps/chosen": -1.6286230087280273, "logps/rejected": -61.687870025634766, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.151333808898926, "rewards/margins": 2.151333808898926, "rewards/rejected": 0.0, "step": 2418 }, { "epoch": 13.513966480446927, "grad_norm": 0.37209910965383863, "learning_rate": 7.17882895508149e-08, "logits/chosen": -3.263106346130371, "logits/rejected": -3.3420896530151367, "logps/chosen": -0.9882630109786987, "logps/rejected": -48.91582489013672, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": 2.689342498779297, "rewards/margins": 2.689342498779297, "rewards/rejected": 0.0, "step": 2419 }, { "epoch": 13.519553072625698, "grad_norm": 0.5386244488849635, "learning_rate": 7.147391577303846e-08, "logits/chosen": -3.374979257583618, "logits/rejected": -3.3277366161346436, "logps/chosen": -0.9086631536483765, "logps/rejected": -60.96214294433594, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 2.7497658729553223, "rewards/margins": 2.7497658729553223, "rewards/rejected": 0.0, "step": 2420 }, { "epoch": 13.519553072625698, "eval_logits/chosen": -3.2651188373565674, "eval_logits/rejected": -3.3867595195770264, "eval_logps/chosen": -30.666950225830078, "eval_logps/rejected": -55.848052978515625, "eval_loss": 1.037139654159546, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.059690773487091064, "eval_rewards/margins": -0.059690773487091064, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6905, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 2420 }, { "epoch": 13.525139664804469, "grad_norm": 0.3462712885292635, "learning_rate": 7.116017886133946e-08, "logits/chosen": -3.36868953704834, "logits/rejected": -3.415334463119507, "logps/chosen": -1.1047523021697998, "logps/rejected": -50.67715835571289, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 2.1574814319610596, "rewards/margins": 2.1574814319610596, "rewards/rejected": 0.0, "step": 2421 }, { "epoch": 13.53072625698324, "grad_norm": 0.8207276516407748, "learning_rate": 7.084707928198702e-08, "logits/chosen": -3.461683511734009, "logits/rejected": -3.4847850799560547, "logps/chosen": -0.8327699899673462, "logps/rejected": -43.814453125, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 2.402855396270752, "rewards/margins": 2.402855396270752, "rewards/rejected": 0.0, "step": 2422 }, { "epoch": 13.53631284916201, "grad_norm": 1.3697531927469424, "learning_rate": 7.053461750030249e-08, "logits/chosen": -3.266954183578491, "logits/rejected": -3.3406190872192383, "logps/chosen": -0.3930329382419586, "logps/rejected": -58.03802490234375, "loss": 0.1051, "rewards/accuracies": 1.0, "rewards/chosen": 1.7540619373321533, "rewards/margins": 1.7540619373321533, "rewards/rejected": 0.0, "step": 2423 }, { "epoch": 13.541899441340782, "grad_norm": 0.3806872794265044, "learning_rate": 7.022279398066005e-08, "logits/chosen": -3.372802734375, "logits/rejected": -3.563987970352173, "logps/chosen": -3.2917611598968506, "logps/rejected": -73.0836181640625, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 3.173635959625244, "rewards/margins": 3.173635959625244, "rewards/rejected": 0.0, "step": 2424 }, { "epoch": 13.547486033519553, "grad_norm": 0.8882674169538836, "learning_rate": 6.991160918648493e-08, "logits/chosen": -3.353987455368042, "logits/rejected": -3.4794535636901855, "logps/chosen": -1.3371047973632812, "logps/rejected": -49.239593505859375, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.450688362121582, "rewards/margins": 2.450688362121582, "rewards/rejected": 0.0, "step": 2425 }, { "epoch": 13.553072625698324, "grad_norm": 0.3142022219595674, "learning_rate": 6.960106358025308e-08, "logits/chosen": -3.6025381088256836, "logits/rejected": -3.450307846069336, "logps/chosen": -3.1776933670043945, "logps/rejected": -82.77340698242188, "loss": 0.1186, "rewards/accuracies": 1.0, "rewards/chosen": 3.1456172466278076, "rewards/margins": 3.1456172466278076, "rewards/rejected": 0.0, "step": 2426 }, { "epoch": 13.558659217877095, "grad_norm": 0.3932636939664874, "learning_rate": 6.929115762349047e-08, "logits/chosen": -3.5441834926605225, "logits/rejected": -3.4236481189727783, "logps/chosen": -0.7303559184074402, "logps/rejected": -36.51518630981445, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.1503782272338867, "rewards/margins": 2.1503782272338867, "rewards/rejected": 0.0, "step": 2427 }, { "epoch": 13.564245810055866, "grad_norm": 0.40096723201484263, "learning_rate": 6.898189177677294e-08, "logits/chosen": -3.488233804702759, "logits/rejected": -3.387334108352661, "logps/chosen": -4.267492771148682, "logps/rejected": -40.590179443359375, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 3.034738540649414, "rewards/margins": 3.034738540649414, "rewards/rejected": 0.0, "step": 2428 }, { "epoch": 13.569832402234637, "grad_norm": 0.4350445433608281, "learning_rate": 6.867326649972415e-08, "logits/chosen": -3.425281286239624, "logits/rejected": -3.5342533588409424, "logps/chosen": -0.9734011888504028, "logps/rejected": -107.08503723144531, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 2.7057478427886963, "rewards/margins": 2.7057478427886963, "rewards/rejected": 0.0, "step": 2429 }, { "epoch": 13.575418994413408, "grad_norm": 0.8901314257917506, "learning_rate": 6.836528225101651e-08, "logits/chosen": -3.3475356101989746, "logits/rejected": -3.3032736778259277, "logps/chosen": -1.2079683542251587, "logps/rejected": -50.296966552734375, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 2.3507351875305176, "rewards/margins": 2.3507351875305176, "rewards/rejected": 0.0, "step": 2430 }, { "epoch": 13.581005586592179, "grad_norm": 0.42334171374625834, "learning_rate": 6.805793948836941e-08, "logits/chosen": -3.5956475734710693, "logits/rejected": -3.687025547027588, "logps/chosen": -4.879007339477539, "logps/rejected": -56.3758430480957, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 3.328376054763794, "rewards/margins": 3.328376054763794, "rewards/rejected": 0.0, "step": 2431 }, { "epoch": 13.58659217877095, "grad_norm": 0.47845070758021246, "learning_rate": 6.775123866854888e-08, "logits/chosen": -3.5741028785705566, "logits/rejected": -3.652073383331299, "logps/chosen": -2.6571004390716553, "logps/rejected": -65.98702239990234, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 2.9428372383117676, "rewards/margins": 2.9428372383117676, "rewards/rejected": 0.0, "step": 2432 }, { "epoch": 13.59217877094972, "grad_norm": 0.901852773461313, "learning_rate": 6.744518024736696e-08, "logits/chosen": -3.3788886070251465, "logits/rejected": -3.3145554065704346, "logps/chosen": -0.9111521244049072, "logps/rejected": -66.0184097290039, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 2.013181686401367, "rewards/margins": 2.013181686401367, "rewards/rejected": 0.0, "step": 2433 }, { "epoch": 13.597765363128492, "grad_norm": 0.43246246536836536, "learning_rate": 6.713976467968102e-08, "logits/chosen": -3.3495986461639404, "logits/rejected": -3.4264187812805176, "logps/chosen": -0.4869656562805176, "logps/rejected": -58.441123962402344, "loss": 0.0954, "rewards/accuracies": 1.0, "rewards/chosen": 2.1961636543273926, "rewards/margins": 2.1961636543273926, "rewards/rejected": 0.0, "step": 2434 }, { "epoch": 13.603351955307263, "grad_norm": 0.5290181129690829, "learning_rate": 6.683499241939295e-08, "logits/chosen": -3.6454989910125732, "logits/rejected": -3.300635814666748, "logps/chosen": -1.7670848369598389, "logps/rejected": -61.70159912109375, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 2.9578745365142822, "rewards/margins": 2.9578745365142822, "rewards/rejected": 0.0, "step": 2435 }, { "epoch": 13.608938547486034, "grad_norm": 0.6807000900549025, "learning_rate": 6.653086391944851e-08, "logits/chosen": -2.8142776489257812, "logits/rejected": -2.8938748836517334, "logps/chosen": -41.72840118408203, "logps/rejected": -94.65515899658203, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 2.8236005306243896, "rewards/margins": 2.8236005306243896, "rewards/rejected": 0.0, "step": 2436 }, { "epoch": 13.614525139664805, "grad_norm": 0.42846793246064874, "learning_rate": 6.62273796318371e-08, "logits/chosen": -3.383944034576416, "logits/rejected": -3.426647424697876, "logps/chosen": -0.9192503094673157, "logps/rejected": -69.60661315917969, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": 2.3855140209198, "rewards/margins": 2.3855140209198, "rewards/rejected": 0.0, "step": 2437 }, { "epoch": 13.620111731843576, "grad_norm": 0.3407561917723307, "learning_rate": 6.592454000759007e-08, "logits/chosen": -3.527812957763672, "logits/rejected": -3.5327913761138916, "logps/chosen": -3.513021469116211, "logps/rejected": -22.4188175201416, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 2.804161548614502, "rewards/margins": 2.804161548614502, "rewards/rejected": 0.0, "step": 2438 }, { "epoch": 13.625698324022347, "grad_norm": 1.4595379049514006, "learning_rate": 6.56223454967813e-08, "logits/chosen": -3.3430094718933105, "logits/rejected": -3.432039737701416, "logps/chosen": -0.7493870258331299, "logps/rejected": -61.24337387084961, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 2.4020402431488037, "rewards/margins": 2.4020402431488037, "rewards/rejected": 0.0, "step": 2439 }, { "epoch": 13.631284916201118, "grad_norm": 0.4298410453976905, "learning_rate": 6.532079654852551e-08, "logits/chosen": -3.794787883758545, "logits/rejected": -3.6870479583740234, "logps/chosen": -1.8893942832946777, "logps/rejected": -34.15863037109375, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 3.4425315856933594, "rewards/margins": 3.4425315856933594, "rewards/rejected": 0.0, "step": 2440 }, { "epoch": 13.631284916201118, "eval_logits/chosen": -3.2630057334899902, "eval_logits/rejected": -3.386068820953369, "eval_logps/chosen": -30.32887840270996, "eval_logps/rejected": -55.73801803588867, "eval_loss": 1.0398709774017334, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.025883492082357407, "eval_rewards/margins": -0.025883492082357407, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6927, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 2440 }, { "epoch": 13.636871508379889, "grad_norm": 0.48349027312865905, "learning_rate": 6.501989361097821e-08, "logits/chosen": -3.492025375366211, "logits/rejected": -3.4451024532318115, "logps/chosen": -1.0900187492370605, "logps/rejected": -40.548744201660156, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 2.4581050872802734, "rewards/margins": 2.4581050872802734, "rewards/rejected": 0.0, "step": 2441 }, { "epoch": 13.64245810055866, "grad_norm": 0.31571172431084366, "learning_rate": 6.471963713133471e-08, "logits/chosen": -3.242710828781128, "logits/rejected": -3.1574764251708984, "logps/chosen": -1.191739797592163, "logps/rejected": -74.82826232910156, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 2.147900342941284, "rewards/margins": 2.147900342941284, "rewards/rejected": 0.0, "step": 2442 }, { "epoch": 13.64804469273743, "grad_norm": 0.3764505868960777, "learning_rate": 6.442002755582965e-08, "logits/chosen": -3.401458978652954, "logits/rejected": -3.4201838970184326, "logps/chosen": -1.2702481746673584, "logps/rejected": -29.992412567138672, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.3849074840545654, "rewards/margins": 2.3849074840545654, "rewards/rejected": 0.0, "step": 2443 }, { "epoch": 13.653631284916202, "grad_norm": 0.3700985671014757, "learning_rate": 6.412106532973616e-08, "logits/chosen": -3.3580126762390137, "logits/rejected": -3.4226503372192383, "logps/chosen": -3.0248465538024902, "logps/rejected": -48.47951126098633, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 3.143270969390869, "rewards/margins": 3.143270969390869, "rewards/rejected": 0.0, "step": 2444 }, { "epoch": 13.659217877094973, "grad_norm": 0.3835411769332013, "learning_rate": 6.382275089736533e-08, "logits/chosen": -3.449427604675293, "logits/rejected": -3.3362691402435303, "logps/chosen": -0.9317790269851685, "logps/rejected": -48.91193389892578, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.655461311340332, "rewards/margins": 2.655461311340332, "rewards/rejected": 0.0, "step": 2445 }, { "epoch": 13.664804469273744, "grad_norm": 0.4041573936421392, "learning_rate": 6.352508470206574e-08, "logits/chosen": -3.378014087677002, "logits/rejected": -3.3828506469726562, "logps/chosen": -0.7811074256896973, "logps/rejected": -68.95384979248047, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 2.2991526126861572, "rewards/margins": 2.2991526126861572, "rewards/rejected": 0.0, "step": 2446 }, { "epoch": 13.670391061452515, "grad_norm": 0.381333318267043, "learning_rate": 6.322806718622204e-08, "logits/chosen": -3.376889944076538, "logits/rejected": -3.3511784076690674, "logps/chosen": -9.594871520996094, "logps/rejected": -79.86514282226562, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 3.468109607696533, "rewards/margins": 3.468109607696533, "rewards/rejected": 0.0, "step": 2447 }, { "epoch": 13.675977653631286, "grad_norm": 0.43726183992411216, "learning_rate": 6.293169879125538e-08, "logits/chosen": -3.6175291538238525, "logits/rejected": -3.491042375564575, "logps/chosen": -0.26917219161987305, "logps/rejected": -42.95506286621094, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 2.0813467502593994, "rewards/margins": 2.0813467502593994, "rewards/rejected": 0.0, "step": 2448 }, { "epoch": 13.681564245810057, "grad_norm": 0.3479759291901259, "learning_rate": 6.263597995762199e-08, "logits/chosen": -3.379270553588867, "logits/rejected": -3.3774287700653076, "logps/chosen": -1.5853116512298584, "logps/rejected": -22.441299438476562, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": 2.8696866035461426, "rewards/margins": 2.8696866035461426, "rewards/rejected": 0.0, "step": 2449 }, { "epoch": 13.687150837988828, "grad_norm": 0.8532118856100707, "learning_rate": 6.23409111248126e-08, "logits/chosen": -3.2234463691711426, "logits/rejected": -3.2407162189483643, "logps/chosen": -0.7098406553268433, "logps/rejected": -96.90626525878906, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 2.1984286308288574, "rewards/margins": 2.1984286308288574, "rewards/rejected": 0.0, "step": 2450 }, { "epoch": 13.692737430167599, "grad_norm": 0.3862349518604081, "learning_rate": 6.204649273135209e-08, "logits/chosen": -3.3121843338012695, "logits/rejected": -3.284202814102173, "logps/chosen": -6.544798851013184, "logps/rejected": -29.416772842407227, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 3.399103879928589, "rewards/margins": 3.399103879928589, "rewards/rejected": 0.0, "step": 2451 }, { "epoch": 13.69832402234637, "grad_norm": 0.4318334352488587, "learning_rate": 6.175272521479868e-08, "logits/chosen": -3.348818302154541, "logits/rejected": -3.2802324295043945, "logps/chosen": -0.7584010362625122, "logps/rejected": -87.66751098632812, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": 2.5336742401123047, "rewards/margins": 2.5336742401123047, "rewards/rejected": 0.0, "step": 2452 }, { "epoch": 13.703910614525139, "grad_norm": 0.3912107462193857, "learning_rate": 6.145960901174313e-08, "logits/chosen": -3.1701173782348633, "logits/rejected": -3.342404365539551, "logps/chosen": -1.729007363319397, "logps/rejected": -80.1790771484375, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 2.575544834136963, "rewards/margins": 2.575544834136963, "rewards/rejected": 0.0, "step": 2453 }, { "epoch": 13.70949720670391, "grad_norm": 0.33826820534948204, "learning_rate": 6.116714455780841e-08, "logits/chosen": -3.6022965908050537, "logits/rejected": -3.5730085372924805, "logps/chosen": -0.2734066843986511, "logps/rejected": -37.372413635253906, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 1.6353272199630737, "rewards/margins": 1.6353272199630737, "rewards/rejected": 0.0, "step": 2454 }, { "epoch": 13.71508379888268, "grad_norm": 0.4282417536358434, "learning_rate": 6.087533228764869e-08, "logits/chosen": -3.5499305725097656, "logits/rejected": -3.4979584217071533, "logps/chosen": -3.1904568672180176, "logps/rejected": -50.225433349609375, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.881998062133789, "rewards/margins": 2.881998062133789, "rewards/rejected": 0.0, "step": 2455 }, { "epoch": 13.720670391061452, "grad_norm": 0.4670839027146506, "learning_rate": 6.058417263494892e-08, "logits/chosen": -3.258791446685791, "logits/rejected": -3.4401707649230957, "logps/chosen": -1.52991783618927, "logps/rejected": -37.89375305175781, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": 2.6789238452911377, "rewards/margins": 2.6789238452911377, "rewards/rejected": 0.0, "step": 2456 }, { "epoch": 13.726256983240223, "grad_norm": 0.4269025969426856, "learning_rate": 6.029366603242453e-08, "logits/chosen": -3.4440407752990723, "logits/rejected": -3.2248666286468506, "logps/chosen": -6.943201065063477, "logps/rejected": -71.95067596435547, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 2.6368632316589355, "rewards/margins": 2.6368632316589355, "rewards/rejected": 0.0, "step": 2457 }, { "epoch": 13.731843575418994, "grad_norm": 0.47097584071658793, "learning_rate": 6.00038129118196e-08, "logits/chosen": -3.3258748054504395, "logits/rejected": -3.1533684730529785, "logps/chosen": -0.4051774740219116, "logps/rejected": -113.83723449707031, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 1.7320462465286255, "rewards/margins": 1.7320462465286255, "rewards/rejected": 0.0, "step": 2458 }, { "epoch": 13.737430167597765, "grad_norm": 0.3694995698827105, "learning_rate": 5.971461370390779e-08, "logits/chosen": -3.4326939582824707, "logits/rejected": -3.613823652267456, "logps/chosen": -4.124021053314209, "logps/rejected": -35.067256927490234, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": 3.1807050704956055, "rewards/margins": 3.1807050704956055, "rewards/rejected": 0.0, "step": 2459 }, { "epoch": 13.743016759776536, "grad_norm": 0.3412927447785637, "learning_rate": 5.942606883849061e-08, "logits/chosen": -3.3147404193878174, "logits/rejected": -3.4530279636383057, "logps/chosen": -10.147854804992676, "logps/rejected": -50.02666091918945, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 3.6943697929382324, "rewards/margins": 3.6943697929382324, "rewards/rejected": 0.0, "step": 2460 }, { "epoch": 13.743016759776536, "eval_logits/chosen": -3.2568747997283936, "eval_logits/rejected": -3.381835460662842, "eval_logps/chosen": -30.75702476501465, "eval_logps/rejected": -55.996238708496094, "eval_loss": 1.0451282262802124, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.06869839131832123, "eval_rewards/margins": -0.06869839131832123, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7157, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 2460 }, { "epoch": 13.748603351955307, "grad_norm": 0.5056231660460868, "learning_rate": 5.9138178744397026e-08, "logits/chosen": -3.45645809173584, "logits/rejected": -3.566572666168213, "logps/chosen": -0.503404438495636, "logps/rejected": -55.78275680541992, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.135225772857666, "rewards/margins": 2.135225772857666, "rewards/rejected": 0.0, "step": 2461 }, { "epoch": 13.754189944134078, "grad_norm": 0.38893605521270735, "learning_rate": 5.8850943849483024e-08, "logits/chosen": -3.5577340126037598, "logits/rejected": -3.289719820022583, "logps/chosen": -1.7229567766189575, "logps/rejected": -35.91162109375, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 2.82952880859375, "rewards/margins": 2.82952880859375, "rewards/rejected": 0.0, "step": 2462 }, { "epoch": 13.759776536312849, "grad_norm": 1.391989449429673, "learning_rate": 5.856436458063085e-08, "logits/chosen": -2.970513343811035, "logits/rejected": -3.20465350151062, "logps/chosen": -31.5120792388916, "logps/rejected": -67.71055603027344, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 2.864408254623413, "rewards/margins": 2.864408254623413, "rewards/rejected": 0.0, "step": 2463 }, { "epoch": 13.76536312849162, "grad_norm": 0.3956122948278953, "learning_rate": 5.8278441363748266e-08, "logits/chosen": -3.3388075828552246, "logits/rejected": -3.2895302772521973, "logps/chosen": -1.098218560218811, "logps/rejected": -84.32713317871094, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 2.219403028488159, "rewards/margins": 2.219403028488159, "rewards/rejected": 0.0, "step": 2464 }, { "epoch": 13.77094972067039, "grad_norm": 0.5040526789435122, "learning_rate": 5.799317462376802e-08, "logits/chosen": -3.611959457397461, "logits/rejected": -3.395556688308716, "logps/chosen": -9.191370964050293, "logps/rejected": -23.500995635986328, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 3.60721755027771, "rewards/margins": 3.60721755027771, "rewards/rejected": 0.0, "step": 2465 }, { "epoch": 13.776536312849162, "grad_norm": 0.3623352437962981, "learning_rate": 5.7708564784647585e-08, "logits/chosen": -3.712226629257202, "logits/rejected": -3.5574452877044678, "logps/chosen": -2.2654924392700195, "logps/rejected": -69.63509368896484, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 2.731916904449463, "rewards/margins": 2.731916904449463, "rewards/rejected": 0.0, "step": 2466 }, { "epoch": 13.782122905027933, "grad_norm": 0.4276455071005255, "learning_rate": 5.742461226936746e-08, "logits/chosen": -3.530557632446289, "logits/rejected": -3.3399665355682373, "logps/chosen": -15.764871597290039, "logps/rejected": -51.784812927246094, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 3.7098512649536133, "rewards/margins": 3.7098512649536133, "rewards/rejected": 0.0, "step": 2467 }, { "epoch": 13.787709497206704, "grad_norm": 1.0922418529887215, "learning_rate": 5.714131749993201e-08, "logits/chosen": -3.275423049926758, "logits/rejected": -3.168363094329834, "logps/chosen": -6.508008003234863, "logps/rejected": -30.521949768066406, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": 4.0412983894348145, "rewards/margins": 4.0412983894348145, "rewards/rejected": 0.0, "step": 2468 }, { "epoch": 13.793296089385475, "grad_norm": 0.3808300020693675, "learning_rate": 5.685868089736756e-08, "logits/chosen": -3.273233652114868, "logits/rejected": -3.2076663970947266, "logps/chosen": -2.4366536140441895, "logps/rejected": -45.89551544189453, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 3.258554458618164, "rewards/margins": 3.258554458618164, "rewards/rejected": 0.0, "step": 2469 }, { "epoch": 13.798882681564246, "grad_norm": 0.3625829305182228, "learning_rate": 5.657670288172245e-08, "logits/chosen": -3.233433246612549, "logits/rejected": -3.353445053100586, "logps/chosen": -0.8617339730262756, "logps/rejected": -46.014373779296875, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.202939987182617, "rewards/margins": 2.202939987182617, "rewards/rejected": 0.0, "step": 2470 }, { "epoch": 13.804469273743017, "grad_norm": 0.7421793665656019, "learning_rate": 5.6295383872066174e-08, "logits/chosen": -3.3089280128479004, "logits/rejected": -3.4478418827056885, "logps/chosen": -1.426999807357788, "logps/rejected": -57.93741226196289, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 3.1230382919311523, "rewards/margins": 3.1230382919311523, "rewards/rejected": 0.0, "step": 2471 }, { "epoch": 13.810055865921788, "grad_norm": 0.5560215504161522, "learning_rate": 5.601472428648901e-08, "logits/chosen": -3.2834036350250244, "logits/rejected": -3.293836832046509, "logps/chosen": -5.384027004241943, "logps/rejected": -43.36772918701172, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": 3.096794605255127, "rewards/margins": 3.096794605255127, "rewards/rejected": 0.0, "step": 2472 }, { "epoch": 13.815642458100559, "grad_norm": 0.48068691739954705, "learning_rate": 5.573472454210099e-08, "logits/chosen": -3.3021554946899414, "logits/rejected": -3.2614760398864746, "logps/chosen": -0.7331230640411377, "logps/rejected": -48.53559875488281, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 2.484909772872925, "rewards/margins": 2.484909772872925, "rewards/rejected": 0.0, "step": 2473 }, { "epoch": 13.82122905027933, "grad_norm": 0.4228967725332, "learning_rate": 5.54553850550315e-08, "logits/chosen": -3.1667332649230957, "logits/rejected": -3.569082260131836, "logps/chosen": -4.470986366271973, "logps/rejected": -49.33929443359375, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 2.8256728649139404, "rewards/margins": 2.8256728649139404, "rewards/rejected": 0.0, "step": 2474 }, { "epoch": 13.8268156424581, "grad_norm": 0.5758302695747335, "learning_rate": 5.517670624042908e-08, "logits/chosen": -3.196789503097534, "logits/rejected": -3.2065672874450684, "logps/chosen": -2.022822856903076, "logps/rejected": -60.15814208984375, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": 2.174039363861084, "rewards/margins": 2.174039363861084, "rewards/rejected": 0.0, "step": 2475 }, { "epoch": 13.832402234636872, "grad_norm": 0.3537566198334273, "learning_rate": 5.489868851245966e-08, "logits/chosen": -3.4229576587677, "logits/rejected": -3.4131813049316406, "logps/chosen": -0.5902296304702759, "logps/rejected": -44.242584228515625, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 2.101193428039551, "rewards/margins": 2.101193428039551, "rewards/rejected": 0.0, "step": 2476 }, { "epoch": 13.837988826815643, "grad_norm": 0.5164478856248721, "learning_rate": 5.4621332284307485e-08, "logits/chosen": -3.5133912563323975, "logits/rejected": -3.3796379566192627, "logps/chosen": -0.5123422145843506, "logps/rejected": -72.57955932617188, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 1.9728453159332275, "rewards/margins": 1.9728453159332275, "rewards/rejected": 0.0, "step": 2477 }, { "epoch": 13.843575418994414, "grad_norm": 0.34060610282066395, "learning_rate": 5.4344637968173066e-08, "logits/chosen": -3.4484715461730957, "logits/rejected": -3.355842113494873, "logps/chosen": -4.761620044708252, "logps/rejected": -63.75336456298828, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 3.49881649017334, "rewards/margins": 3.49881649017334, "rewards/rejected": 0.0, "step": 2478 }, { "epoch": 13.849162011173185, "grad_norm": 0.36252659299686363, "learning_rate": 5.406860597527352e-08, "logits/chosen": -3.4152116775512695, "logits/rejected": -3.427280902862549, "logps/chosen": -1.1045894622802734, "logps/rejected": -71.25437927246094, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.2757999897003174, "rewards/margins": 2.2757999897003174, "rewards/rejected": 0.0, "step": 2479 }, { "epoch": 13.854748603351956, "grad_norm": 0.4692979948850874, "learning_rate": 5.379323671584157e-08, "logits/chosen": -3.529876470565796, "logits/rejected": -3.5923233032226562, "logps/chosen": -28.87140655517578, "logps/rejected": -52.27913284301758, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 4.453380584716797, "rewards/margins": 4.453380584716797, "rewards/rejected": 0.0, "step": 2480 }, { "epoch": 13.854748603351956, "eval_logits/chosen": -3.259183168411255, "eval_logits/rejected": -3.3825583457946777, "eval_logps/chosen": -30.66366195678711, "eval_logps/rejected": -55.99504852294922, "eval_loss": 1.0457611083984375, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.059362124651670456, "eval_rewards/margins": -0.059362124651670456, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7272, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 2480 }, { "epoch": 13.860335195530727, "grad_norm": 0.3591490248366364, "learning_rate": 5.351853059912492e-08, "logits/chosen": -3.2581000328063965, "logits/rejected": -3.3914709091186523, "logps/chosen": -3.7509045600891113, "logps/rejected": -85.30378723144531, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 2.855607509613037, "rewards/margins": 2.855607509613037, "rewards/rejected": 0.0, "step": 2481 }, { "epoch": 13.865921787709498, "grad_norm": 0.42055219665901744, "learning_rate": 5.324448803338577e-08, "logits/chosen": -3.6724579334259033, "logits/rejected": -3.55611515045166, "logps/chosen": -1.2097103595733643, "logps/rejected": -57.648094177246094, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 2.73215389251709, "rewards/margins": 2.73215389251709, "rewards/rejected": 0.0, "step": 2482 }, { "epoch": 13.871508379888269, "grad_norm": 0.3887641847552534, "learning_rate": 5.297110942590027e-08, "logits/chosen": -3.3861851692199707, "logits/rejected": -3.496201276779175, "logps/chosen": -4.638120651245117, "logps/rejected": -32.964237213134766, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 3.4208145141601562, "rewards/margins": 3.4208145141601562, "rewards/rejected": 0.0, "step": 2483 }, { "epoch": 13.87709497206704, "grad_norm": 0.5033224240333831, "learning_rate": 5.26983951829576e-08, "logits/chosen": -3.4654667377471924, "logits/rejected": -3.3526651859283447, "logps/chosen": -19.81593132019043, "logps/rejected": -29.83510971069336, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 4.035046577453613, "rewards/margins": 4.035046577453613, "rewards/rejected": 0.0, "step": 2484 }, { "epoch": 13.88268156424581, "grad_norm": 0.5211346855195558, "learning_rate": 5.242634570985966e-08, "logits/chosen": -3.5142388343811035, "logits/rejected": -3.537492275238037, "logps/chosen": -9.576385498046875, "logps/rejected": -43.98325729370117, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 2.6981780529022217, "rewards/margins": 2.6981780529022217, "rewards/rejected": 0.0, "step": 2485 }, { "epoch": 13.888268156424582, "grad_norm": 0.394665246695912, "learning_rate": 5.215496141092074e-08, "logits/chosen": -3.624476671218872, "logits/rejected": -3.6229772567749023, "logps/chosen": -1.8939690589904785, "logps/rejected": -43.50250244140625, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 3.396291494369507, "rewards/margins": 3.396291494369507, "rewards/rejected": 0.0, "step": 2486 }, { "epoch": 13.893854748603353, "grad_norm": 0.4739673899151669, "learning_rate": 5.188424268946573e-08, "logits/chosen": -3.3838634490966797, "logits/rejected": -3.3739559650421143, "logps/chosen": -7.8452277183532715, "logps/rejected": -49.918739318847656, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 3.364142417907715, "rewards/margins": 3.364142417907715, "rewards/rejected": 0.0, "step": 2487 }, { "epoch": 13.899441340782122, "grad_norm": 0.4319820305631335, "learning_rate": 5.161418994783129e-08, "logits/chosen": -3.3190858364105225, "logits/rejected": -3.4019381999969482, "logps/chosen": -5.985474586486816, "logps/rejected": -72.33639526367188, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 2.8086514472961426, "rewards/margins": 2.8086514472961426, "rewards/rejected": 0.0, "step": 2488 }, { "epoch": 13.905027932960895, "grad_norm": 0.5339954859468932, "learning_rate": 5.13448035873637e-08, "logits/chosen": -3.6297895908355713, "logits/rejected": -3.5210063457489014, "logps/chosen": -17.91924285888672, "logps/rejected": -60.66377639770508, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 3.580571174621582, "rewards/margins": 3.580571174621582, "rewards/rejected": 0.0, "step": 2489 }, { "epoch": 13.910614525139664, "grad_norm": 0.4504867638513347, "learning_rate": 5.107608400841912e-08, "logits/chosen": -3.335150718688965, "logits/rejected": -3.196349859237671, "logps/chosen": -5.946123123168945, "logps/rejected": -45.173431396484375, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 3.9648706912994385, "rewards/margins": 3.9648706912994385, "rewards/rejected": 0.0, "step": 2490 }, { "epoch": 13.916201117318435, "grad_norm": 0.34157498460295826, "learning_rate": 5.080803161036268e-08, "logits/chosen": -3.4631834030151367, "logits/rejected": -3.5812928676605225, "logps/chosen": -0.24410250782966614, "logps/rejected": -34.043128967285156, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 1.688918113708496, "rewards/margins": 1.688918113708496, "rewards/rejected": 0.0, "step": 2491 }, { "epoch": 13.921787709497206, "grad_norm": 0.4207208697244406, "learning_rate": 5.054064679156794e-08, "logits/chosen": -3.413464069366455, "logits/rejected": -3.4899497032165527, "logps/chosen": -0.3154837191104889, "logps/rejected": -39.27338409423828, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 1.7795913219451904, "rewards/margins": 1.7795913219451904, "rewards/rejected": 0.0, "step": 2492 }, { "epoch": 13.927374301675977, "grad_norm": 0.36446031525425, "learning_rate": 5.0273929949416425e-08, "logits/chosen": -3.2451226711273193, "logits/rejected": -3.5481081008911133, "logps/chosen": -0.687081515789032, "logps/rejected": -55.72673416137695, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 2.5647451877593994, "rewards/margins": 2.5647451877593994, "rewards/rejected": 0.0, "step": 2493 }, { "epoch": 13.932960893854748, "grad_norm": 1.5084722771222736, "learning_rate": 5.0007881480296777e-08, "logits/chosen": -3.366252899169922, "logits/rejected": -3.440880537033081, "logps/chosen": -0.37535303831100464, "logps/rejected": -87.42872619628906, "loss": 0.1351, "rewards/accuracies": 1.0, "rewards/chosen": 1.7993223667144775, "rewards/margins": 1.7993223667144775, "rewards/rejected": 0.0, "step": 2494 }, { "epoch": 13.938547486033519, "grad_norm": 0.5251464349693189, "learning_rate": 4.974250177960459e-08, "logits/chosen": -3.3979170322418213, "logits/rejected": -3.4452855587005615, "logps/chosen": -9.844120025634766, "logps/rejected": -55.45526123046875, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 3.4103057384490967, "rewards/margins": 3.4103057384490967, "rewards/rejected": 0.0, "step": 2495 }, { "epoch": 13.94413407821229, "grad_norm": 0.4172781926412672, "learning_rate": 4.9477791241741116e-08, "logits/chosen": -3.177208185195923, "logits/rejected": -3.424086809158325, "logps/chosen": -30.380510330200195, "logps/rejected": -44.14557647705078, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 2.7149908542633057, "rewards/margins": 2.7149908542633057, "rewards/rejected": 0.0, "step": 2496 }, { "epoch": 13.949720670391061, "grad_norm": 0.48841241166711075, "learning_rate": 4.921375026011349e-08, "logits/chosen": -3.4830703735351562, "logits/rejected": -3.333214521408081, "logps/chosen": -1.4240703582763672, "logps/rejected": -56.699066162109375, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 2.1017422676086426, "rewards/margins": 2.1017422676086426, "rewards/rejected": 0.0, "step": 2497 }, { "epoch": 13.955307262569832, "grad_norm": 0.6062750944641184, "learning_rate": 4.895037922713369e-08, "logits/chosen": -3.237408399581909, "logits/rejected": -3.168778419494629, "logps/chosen": -0.853169858455658, "logps/rejected": -77.13175964355469, "loss": 0.127, "rewards/accuracies": 1.0, "rewards/chosen": 2.408933162689209, "rewards/margins": 2.408933162689209, "rewards/rejected": 0.0, "step": 2498 }, { "epoch": 13.960893854748603, "grad_norm": 0.5009395657043406, "learning_rate": 4.8687678534217845e-08, "logits/chosen": -3.6134917736053467, "logits/rejected": -3.5716700553894043, "logps/chosen": -0.602191150188446, "logps/rejected": -55.38518524169922, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": 2.096320629119873, "rewards/margins": 2.096320629119873, "rewards/rejected": 0.0, "step": 2499 }, { "epoch": 13.966480446927374, "grad_norm": 0.5193637019708659, "learning_rate": 4.842564857178605e-08, "logits/chosen": -3.6649270057678223, "logits/rejected": -3.5219991207122803, "logps/chosen": -6.3779802322387695, "logps/rejected": -44.12037658691406, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 3.800743341445923, "rewards/margins": 3.800743341445923, "rewards/rejected": 0.0, "step": 2500 }, { "epoch": 13.966480446927374, "eval_logits/chosen": -3.26655912399292, "eval_logits/rejected": -3.389557361602783, "eval_logps/chosen": -30.509235382080078, "eval_logps/rejected": -55.891700744628906, "eval_loss": 1.043331265449524, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.043919093906879425, "eval_rewards/margins": -0.043919093906879425, "eval_rewards/rejected": 0.0, "eval_runtime": 32.705, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.306, "step": 2500 }, { "epoch": 13.972067039106145, "grad_norm": 1.3732558235325156, "learning_rate": 4.8164289729261456e-08, "logits/chosen": -3.3140311241149902, "logits/rejected": -3.2370216846466064, "logps/chosen": -9.072028160095215, "logps/rejected": -29.328149795532227, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 3.3890528678894043, "rewards/margins": 3.3890528678894043, "rewards/rejected": 0.0, "step": 2501 }, { "epoch": 13.977653631284916, "grad_norm": 0.4150946977059628, "learning_rate": 4.7903602395069854e-08, "logits/chosen": -3.29394268989563, "logits/rejected": -3.1019275188446045, "logps/chosen": -0.9298399090766907, "logps/rejected": -29.049501419067383, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 2.187602996826172, "rewards/margins": 2.187602996826172, "rewards/rejected": 0.0, "step": 2502 }, { "epoch": 13.983240223463687, "grad_norm": 2.046627753222545, "learning_rate": 4.7643586956639004e-08, "logits/chosen": -3.40679669380188, "logits/rejected": -3.2866289615631104, "logps/chosen": -1.019418716430664, "logps/rejected": -25.22226333618164, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.2473247051239014, "rewards/margins": 2.2473247051239014, "rewards/rejected": 0.0, "step": 2503 }, { "epoch": 13.988826815642458, "grad_norm": 0.4593675037651337, "learning_rate": 4.738424380039835e-08, "logits/chosen": -3.554079055786133, "logits/rejected": -3.388239860534668, "logps/chosen": -3.9275014400482178, "logps/rejected": -40.449485778808594, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 3.1833853721618652, "rewards/margins": 3.1833853721618652, "rewards/rejected": 0.0, "step": 2504 }, { "epoch": 13.994413407821229, "grad_norm": 0.380511098651879, "learning_rate": 4.712557331177775e-08, "logits/chosen": -3.6009199619293213, "logits/rejected": -3.547635078430176, "logps/chosen": -0.5908699035644531, "logps/rejected": -67.65447235107422, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 2.278252601623535, "rewards/margins": 2.278252601623535, "rewards/rejected": 0.0, "step": 2505 }, { "epoch": 14.0, "grad_norm": 0.6200767041535287, "learning_rate": 4.686757587520773e-08, "logits/chosen": -3.1858692169189453, "logits/rejected": -3.4097728729248047, "logps/chosen": -0.5349952578544617, "logps/rejected": -65.05491638183594, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 1.9272640943527222, "rewards/margins": 1.9272640943527222, "rewards/rejected": 0.0, "step": 2506 }, { "epoch": 14.005586592178771, "grad_norm": 0.39174736869501914, "learning_rate": 4.661025187411849e-08, "logits/chosen": -2.706138849258423, "logits/rejected": -2.865696430206299, "logps/chosen": -25.698196411132812, "logps/rejected": -51.548553466796875, "loss": 0.1079, "rewards/accuracies": 1.0, "rewards/chosen": 3.400902271270752, "rewards/margins": 3.400902271270752, "rewards/rejected": 0.0, "step": 2507 }, { "epoch": 14.011173184357542, "grad_norm": 0.4515259932811653, "learning_rate": 4.635360169093927e-08, "logits/chosen": -3.237030506134033, "logits/rejected": -3.3163609504699707, "logps/chosen": -0.3199694752693176, "logps/rejected": -51.186988830566406, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 1.9741865396499634, "rewards/margins": 1.9741865396499634, "rewards/rejected": 0.0, "step": 2508 }, { "epoch": 14.016759776536313, "grad_norm": 0.34533738462280605, "learning_rate": 4.6097625707097906e-08, "logits/chosen": -3.1123125553131104, "logits/rejected": -3.3256561756134033, "logps/chosen": -10.399734497070312, "logps/rejected": -43.868534088134766, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 3.3708457946777344, "rewards/margins": 3.3708457946777344, "rewards/rejected": 0.0, "step": 2509 }, { "epoch": 14.022346368715084, "grad_norm": 0.5019482966361488, "learning_rate": 4.584232430302032e-08, "logits/chosen": -3.5791730880737305, "logits/rejected": -3.4997928142547607, "logps/chosen": -8.169157028198242, "logps/rejected": -37.524234771728516, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 3.1666946411132812, "rewards/margins": 3.1666946411132812, "rewards/rejected": 0.0, "step": 2510 }, { "epoch": 14.027932960893855, "grad_norm": 0.36052654630885134, "learning_rate": 4.5587697858129936e-08, "logits/chosen": -3.343183994293213, "logits/rejected": -3.2706618309020996, "logps/chosen": -5.515871047973633, "logps/rejected": -42.919532775878906, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 3.694736957550049, "rewards/margins": 3.694736957550049, "rewards/rejected": 0.0, "step": 2511 }, { "epoch": 14.033519553072626, "grad_norm": 0.4548371099956356, "learning_rate": 4.533374675084689e-08, "logits/chosen": -3.6490325927734375, "logits/rejected": -3.58361554145813, "logps/chosen": -3.832155466079712, "logps/rejected": -53.35199737548828, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 2.9735910892486572, "rewards/margins": 2.9735910892486572, "rewards/rejected": 0.0, "step": 2512 }, { "epoch": 14.039106145251397, "grad_norm": 0.31969567415625777, "learning_rate": 4.508047135858778e-08, "logits/chosen": -2.882420063018799, "logits/rejected": -2.9554548263549805, "logps/chosen": -2.9381048679351807, "logps/rejected": -37.72036361694336, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.9198451042175293, "rewards/margins": 2.9198451042175293, "rewards/rejected": 0.0, "step": 2513 }, { "epoch": 14.044692737430168, "grad_norm": 0.4494046124884269, "learning_rate": 4.482787205776495e-08, "logits/chosen": -3.3964457511901855, "logits/rejected": -3.201207160949707, "logps/chosen": -0.42687922716140747, "logps/rejected": -38.09913635253906, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 1.7753281593322754, "rewards/margins": 1.7753281593322754, "rewards/rejected": 0.0, "step": 2514 }, { "epoch": 14.050279329608939, "grad_norm": 0.39334101637067403, "learning_rate": 4.45759492237861e-08, "logits/chosen": -3.2897698879241943, "logits/rejected": -3.1025137901306152, "logps/chosen": -3.072049617767334, "logps/rejected": -51.14334487915039, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 2.66015887260437, "rewards/margins": 2.66015887260437, "rewards/rejected": 0.0, "step": 2515 }, { "epoch": 14.05586592178771, "grad_norm": 0.44879904847492463, "learning_rate": 4.432470323105309e-08, "logits/chosen": -3.555020332336426, "logits/rejected": -3.409587860107422, "logps/chosen": -2.543583393096924, "logps/rejected": -87.54447174072266, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 2.9039902687072754, "rewards/margins": 2.9039902687072754, "rewards/rejected": 0.0, "step": 2516 }, { "epoch": 14.061452513966481, "grad_norm": 0.3836887953363112, "learning_rate": 4.407413445296254e-08, "logits/chosen": -3.2767131328582764, "logits/rejected": -3.217470169067383, "logps/chosen": -1.1344311237335205, "logps/rejected": -57.810550689697266, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 2.732820510864258, "rewards/margins": 2.732820510864258, "rewards/rejected": 0.0, "step": 2517 }, { "epoch": 14.067039106145252, "grad_norm": 0.45793046586598585, "learning_rate": 4.3824243261904146e-08, "logits/chosen": -3.396780490875244, "logits/rejected": -3.420109510421753, "logps/chosen": -0.9700617790222168, "logps/rejected": -62.61637496948242, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 2.1917288303375244, "rewards/margins": 2.1917288303375244, "rewards/rejected": 0.0, "step": 2518 }, { "epoch": 14.072625698324023, "grad_norm": 0.40737209774679406, "learning_rate": 4.3575030029260715e-08, "logits/chosen": -3.4730730056762695, "logits/rejected": -3.267848491668701, "logps/chosen": -13.06028938293457, "logps/rejected": -40.35478973388672, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 3.1192760467529297, "rewards/margins": 3.1192760467529297, "rewards/rejected": 0.0, "step": 2519 }, { "epoch": 14.078212290502794, "grad_norm": 0.3516096457488634, "learning_rate": 4.332649512540748e-08, "logits/chosen": -3.439720392227173, "logits/rejected": -3.3801205158233643, "logps/chosen": -1.9102089405059814, "logps/rejected": -59.78977966308594, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.7264504432678223, "rewards/margins": 2.7264504432678223, "rewards/rejected": 0.0, "step": 2520 }, { "epoch": 14.078212290502794, "eval_logits/chosen": -3.254176616668701, "eval_logits/rejected": -3.378744602203369, "eval_logps/chosen": -30.575481414794922, "eval_logps/rejected": -56.211090087890625, "eval_loss": 1.0447423458099365, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.05054422467947006, "eval_rewards/margins": -0.05054422467947006, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7152, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.306, "step": 2520 }, { "epoch": 14.083798882681565, "grad_norm": 0.47111034592068946, "learning_rate": 4.307863891971164e-08, "logits/chosen": -3.168804407119751, "logits/rejected": -3.2795393466949463, "logps/chosen": -11.020282745361328, "logps/rejected": -35.376094818115234, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 4.0943098068237305, "rewards/margins": 4.0943098068237305, "rewards/rejected": 0.0, "step": 2521 }, { "epoch": 14.089385474860336, "grad_norm": 0.40081552329055525, "learning_rate": 4.283146178053154e-08, "logits/chosen": -3.5196638107299805, "logits/rejected": -3.485856771469116, "logps/chosen": -0.3709907829761505, "logps/rejected": -83.14396667480469, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 2.1558685302734375, "rewards/margins": 2.1558685302734375, "rewards/rejected": 0.0, "step": 2522 }, { "epoch": 14.094972067039107, "grad_norm": 0.34594186164033264, "learning_rate": 4.258496407521644e-08, "logits/chosen": -3.193652391433716, "logits/rejected": -3.2667601108551025, "logps/chosen": -27.787944793701172, "logps/rejected": -39.35036087036133, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": 3.521094799041748, "rewards/margins": 3.521094799041748, "rewards/rejected": 0.0, "step": 2523 }, { "epoch": 14.100558659217878, "grad_norm": 0.3721317257821369, "learning_rate": 4.2339146170106086e-08, "logits/chosen": -3.4976680278778076, "logits/rejected": -3.675879716873169, "logps/chosen": -0.825690746307373, "logps/rejected": -30.738506317138672, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 2.6704134941101074, "rewards/margins": 2.6704134941101074, "rewards/rejected": 0.0, "step": 2524 }, { "epoch": 14.106145251396647, "grad_norm": 0.38951100725223997, "learning_rate": 4.209400843052924e-08, "logits/chosen": -3.378603458404541, "logits/rejected": -3.3314805030822754, "logps/chosen": -1.0683079957962036, "logps/rejected": -33.909095764160156, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": 2.552832841873169, "rewards/margins": 2.552832841873169, "rewards/rejected": 0.0, "step": 2525 }, { "epoch": 14.111731843575418, "grad_norm": 0.43354575033828296, "learning_rate": 4.184955122080458e-08, "logits/chosen": -3.3340320587158203, "logits/rejected": -3.439610004425049, "logps/chosen": -0.18201176822185516, "logps/rejected": -59.50272750854492, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": 1.106680154800415, "rewards/margins": 1.106680154800415, "rewards/rejected": 0.0, "step": 2526 }, { "epoch": 14.11731843575419, "grad_norm": 0.663533112419587, "learning_rate": 4.160577490423894e-08, "logits/chosen": -3.37919282913208, "logits/rejected": -3.4174959659576416, "logps/chosen": -4.811269760131836, "logps/rejected": -30.667572021484375, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 2.9435300827026367, "rewards/margins": 2.9435300827026367, "rewards/rejected": 0.0, "step": 2527 }, { "epoch": 14.12290502793296, "grad_norm": 0.3576851415028276, "learning_rate": 4.1362679843127356e-08, "logits/chosen": -3.211427688598633, "logits/rejected": -3.2749810218811035, "logps/chosen": -28.984590530395508, "logps/rejected": -51.678653717041016, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": 3.3913588523864746, "rewards/margins": 3.3913588523864746, "rewards/rejected": 0.0, "step": 2528 }, { "epoch": 14.128491620111731, "grad_norm": 0.4741016444318694, "learning_rate": 4.1120266398752326e-08, "logits/chosen": -3.5302581787109375, "logits/rejected": -3.514535427093506, "logps/chosen": -8.391327857971191, "logps/rejected": -37.496742248535156, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 4.247954368591309, "rewards/margins": 4.247954368591309, "rewards/rejected": 0.0, "step": 2529 }, { "epoch": 14.134078212290502, "grad_norm": 0.4395296616487134, "learning_rate": 4.0878534931383435e-08, "logits/chosen": -3.499288558959961, "logits/rejected": -3.4895715713500977, "logps/chosen": -1.2045276165008545, "logps/rejected": -32.941505432128906, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 2.3039512634277344, "rewards/margins": 2.3039512634277344, "rewards/rejected": 0.0, "step": 2530 }, { "epoch": 14.139664804469273, "grad_norm": 0.35539218186692995, "learning_rate": 4.063748580027676e-08, "logits/chosen": -3.5145745277404785, "logits/rejected": -3.4668242931365967, "logps/chosen": -0.7444800138473511, "logps/rejected": -56.768516540527344, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 2.1890501976013184, "rewards/margins": 2.1890501976013184, "rewards/rejected": 0.0, "step": 2531 }, { "epoch": 14.145251396648044, "grad_norm": 0.5206543439377311, "learning_rate": 4.0397119363674015e-08, "logits/chosen": -3.6280972957611084, "logits/rejected": -3.4742085933685303, "logps/chosen": -8.813718795776367, "logps/rejected": -34.00957489013672, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 4.191366195678711, "rewards/margins": 4.191366195678711, "rewards/rejected": 0.0, "step": 2532 }, { "epoch": 14.150837988826815, "grad_norm": 0.43278429496198656, "learning_rate": 4.015743597880289e-08, "logits/chosen": -3.387723207473755, "logits/rejected": -3.6379129886627197, "logps/chosen": -0.9776702523231506, "logps/rejected": -97.9456787109375, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 2.0290231704711914, "rewards/margins": 2.0290231704711914, "rewards/rejected": 0.0, "step": 2533 }, { "epoch": 14.156424581005586, "grad_norm": 0.5068155687807977, "learning_rate": 3.991843600187522e-08, "logits/chosen": -3.269091844558716, "logits/rejected": -3.4184460639953613, "logps/chosen": -1.414226770401001, "logps/rejected": -63.77281188964844, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 2.5663299560546875, "rewards/margins": 2.5663299560546875, "rewards/rejected": 0.0, "step": 2534 }, { "epoch": 14.162011173184357, "grad_norm": 0.40955927938213654, "learning_rate": 3.968011978808783e-08, "logits/chosen": -3.5155231952667236, "logits/rejected": -3.675851345062256, "logps/chosen": -0.7108304500579834, "logps/rejected": -74.70698547363281, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.098842144012451, "rewards/margins": 2.098842144012451, "rewards/rejected": 0.0, "step": 2535 }, { "epoch": 14.167597765363128, "grad_norm": 0.40162096835373096, "learning_rate": 3.944248769162079e-08, "logits/chosen": -3.493814706802368, "logits/rejected": -3.4598453044891357, "logps/chosen": -21.722007751464844, "logps/rejected": -58.04270553588867, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 3.4508676528930664, "rewards/margins": 3.4508676528930664, "rewards/rejected": 0.0, "step": 2536 }, { "epoch": 14.1731843575419, "grad_norm": 0.43248293866437015, "learning_rate": 3.920554006563792e-08, "logits/chosen": -3.0406687259674072, "logits/rejected": -2.966635227203369, "logps/chosen": -1.9222922325134277, "logps/rejected": -38.26771545410156, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 2.8196706771850586, "rewards/margins": 2.8196706771850586, "rewards/rejected": 0.0, "step": 2537 }, { "epoch": 14.17877094972067, "grad_norm": 0.40370307551024015, "learning_rate": 3.8969277262285495e-08, "logits/chosen": -3.1226117610931396, "logits/rejected": -3.253596782684326, "logps/chosen": -7.3527936935424805, "logps/rejected": -64.65121459960938, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 2.6517767906188965, "rewards/margins": 2.6517767906188965, "rewards/rejected": 0.0, "step": 2538 }, { "epoch": 14.184357541899441, "grad_norm": 0.42280144704831885, "learning_rate": 3.873369963269219e-08, "logits/chosen": -3.481581687927246, "logits/rejected": -3.59002947807312, "logps/chosen": -0.7101815938949585, "logps/rejected": -84.5839614868164, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.281764030456543, "rewards/margins": 2.281764030456543, "rewards/rejected": 0.0, "step": 2539 }, { "epoch": 14.189944134078212, "grad_norm": 0.3405423202530319, "learning_rate": 3.8498807526968215e-08, "logits/chosen": -3.2610485553741455, "logits/rejected": -3.4200282096862793, "logps/chosen": -0.4112340807914734, "logps/rejected": -55.193363189697266, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 2.174483299255371, "rewards/margins": 2.174483299255371, "rewards/rejected": 0.0, "step": 2540 }, { "epoch": 14.189944134078212, "eval_logits/chosen": -3.257338285446167, "eval_logits/rejected": -3.381732940673828, "eval_logps/chosen": -30.871402740478516, "eval_logps/rejected": -56.050811767578125, "eval_loss": 1.0482580661773682, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.08013595640659332, "eval_rewards/margins": -0.08013595640659332, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6937, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 2540 }, { "epoch": 14.195530726256983, "grad_norm": 0.38538809453258466, "learning_rate": 3.826460129420511e-08, "logits/chosen": -3.4766619205474854, "logits/rejected": -3.399369478225708, "logps/chosen": -22.428661346435547, "logps/rejected": -32.86236572265625, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 3.9801149368286133, "rewards/margins": 3.9801149368286133, "rewards/rejected": 0.0, "step": 2541 }, { "epoch": 14.201117318435754, "grad_norm": 0.35870652344020576, "learning_rate": 3.803108128247512e-08, "logits/chosen": -3.4228286743164062, "logits/rejected": -3.4164814949035645, "logps/chosen": -1.1249449253082275, "logps/rejected": -62.07542419433594, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": 2.915952682495117, "rewards/margins": 2.915952682495117, "rewards/rejected": 0.0, "step": 2542 }, { "epoch": 14.206703910614525, "grad_norm": 0.38860705277975816, "learning_rate": 3.77982478388304e-08, "logits/chosen": -3.1100687980651855, "logits/rejected": -3.129997491836548, "logps/chosen": -8.455366134643555, "logps/rejected": -24.278011322021484, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 2.617151975631714, "rewards/margins": 2.617151975631714, "rewards/rejected": 0.0, "step": 2543 }, { "epoch": 14.212290502793296, "grad_norm": 0.4470156301698857, "learning_rate": 3.7566101309303156e-08, "logits/chosen": -3.403496265411377, "logits/rejected": -3.533383369445801, "logps/chosen": -12.462495803833008, "logps/rejected": -74.77552795410156, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 2.8615341186523438, "rewards/margins": 2.8615341186523438, "rewards/rejected": 0.0, "step": 2544 }, { "epoch": 14.217877094972067, "grad_norm": 0.42620057037924514, "learning_rate": 3.7334642038904185e-08, "logits/chosen": -3.377558469772339, "logits/rejected": -3.22792649269104, "logps/chosen": -0.30472633242607117, "logps/rejected": -60.303619384765625, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 1.7432624101638794, "rewards/margins": 1.7432624101638794, "rewards/rejected": 0.0, "step": 2545 }, { "epoch": 14.223463687150838, "grad_norm": 0.35496394065499254, "learning_rate": 3.7103870371623434e-08, "logits/chosen": -3.5415914058685303, "logits/rejected": -3.546121597290039, "logps/chosen": -0.2317655086517334, "logps/rejected": -44.095947265625, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": 1.514262080192566, "rewards/margins": 1.514262080192566, "rewards/rejected": 0.0, "step": 2546 }, { "epoch": 14.22905027932961, "grad_norm": 0.5830025937584982, "learning_rate": 3.687378665042862e-08, "logits/chosen": -3.3116681575775146, "logits/rejected": -3.3289129734039307, "logps/chosen": -0.45762497186660767, "logps/rejected": -55.56715393066406, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 1.9170093536376953, "rewards/margins": 1.9170093536376953, "rewards/rejected": 0.0, "step": 2547 }, { "epoch": 14.23463687150838, "grad_norm": 0.36813760464536227, "learning_rate": 3.664439121726509e-08, "logits/chosen": -3.4270031452178955, "logits/rejected": -3.4405107498168945, "logps/chosen": -3.3159165382385254, "logps/rejected": -52.413536071777344, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 3.170335292816162, "rewards/margins": 3.170335292816162, "rewards/rejected": 0.0, "step": 2548 }, { "epoch": 14.240223463687151, "grad_norm": 0.40096396614975366, "learning_rate": 3.641568441305532e-08, "logits/chosen": -3.6501471996307373, "logits/rejected": -3.588867425918579, "logps/chosen": -31.158584594726562, "logps/rejected": -20.77127456665039, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": 3.6337647438049316, "rewards/margins": 3.6337647438049316, "rewards/rejected": 0.0, "step": 2549 }, { "epoch": 14.245810055865922, "grad_norm": 0.3740658947458769, "learning_rate": 3.618766657769834e-08, "logits/chosen": -3.2856688499450684, "logits/rejected": -3.244035005569458, "logps/chosen": -1.9044477939605713, "logps/rejected": -32.862998962402344, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 3.0152015686035156, "rewards/margins": 3.0152015686035156, "rewards/rejected": 0.0, "step": 2550 }, { "epoch": 14.251396648044693, "grad_norm": 0.4314091033780081, "learning_rate": 3.5960338050069316e-08, "logits/chosen": -3.368588447570801, "logits/rejected": -3.219712257385254, "logps/chosen": -3.6364493370056152, "logps/rejected": -63.97681427001953, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 3.1884875297546387, "rewards/margins": 3.1884875297546387, "rewards/rejected": 0.0, "step": 2551 }, { "epoch": 14.256983240223464, "grad_norm": 0.37774614647382987, "learning_rate": 3.5733699168018826e-08, "logits/chosen": -3.426694393157959, "logits/rejected": -3.3392457962036133, "logps/chosen": -4.514124393463135, "logps/rejected": -33.23273849487305, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 3.8817319869995117, "rewards/margins": 3.8817319869995117, "rewards/rejected": 0.0, "step": 2552 }, { "epoch": 14.262569832402235, "grad_norm": 0.4382203653417537, "learning_rate": 3.550775026837283e-08, "logits/chosen": -3.526038408279419, "logits/rejected": -3.4986283779144287, "logps/chosen": -3.162095546722412, "logps/rejected": -37.188873291015625, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.527355670928955, "rewards/margins": 2.527355670928955, "rewards/rejected": 0.0, "step": 2553 }, { "epoch": 14.268156424581006, "grad_norm": 0.3481932633927292, "learning_rate": 3.528249168693132e-08, "logits/chosen": -3.265181064605713, "logits/rejected": -3.219682455062866, "logps/chosen": -0.8243135809898376, "logps/rejected": -44.84501266479492, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 1.9604628086090088, "rewards/margins": 1.9604628086090088, "rewards/rejected": 0.0, "step": 2554 }, { "epoch": 14.273743016759777, "grad_norm": 0.35303876429225944, "learning_rate": 3.505792375846889e-08, "logits/chosen": -3.349062204360962, "logits/rejected": -3.506606101989746, "logps/chosen": -4.68916130065918, "logps/rejected": -51.4373893737793, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 2.8279829025268555, "rewards/margins": 2.8279829025268555, "rewards/rejected": 0.0, "step": 2555 }, { "epoch": 14.279329608938548, "grad_norm": 0.40274354528791934, "learning_rate": 3.483404681673341e-08, "logits/chosen": -3.368523597717285, "logits/rejected": -3.2170631885528564, "logps/chosen": -5.536831378936768, "logps/rejected": -51.38372802734375, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 3.8051626682281494, "rewards/margins": 3.8051626682281494, "rewards/rejected": 0.0, "step": 2556 }, { "epoch": 14.28491620111732, "grad_norm": 0.38131181541146014, "learning_rate": 3.461086119444589e-08, "logits/chosen": -3.321594715118408, "logits/rejected": -3.440338134765625, "logps/chosen": -12.703758239746094, "logps/rejected": -84.10555267333984, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 3.1931815147399902, "rewards/margins": 3.1931815147399902, "rewards/rejected": 0.0, "step": 2557 }, { "epoch": 14.29050279329609, "grad_norm": 0.34319082466630746, "learning_rate": 3.4388367223299906e-08, "logits/chosen": -3.3441390991210938, "logits/rejected": -3.4788408279418945, "logps/chosen": -2.0679593086242676, "logps/rejected": -112.08697509765625, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": 2.617946147918701, "rewards/margins": 2.617946147918701, "rewards/rejected": 0.0, "step": 2558 }, { "epoch": 14.296089385474861, "grad_norm": 0.5708486696705353, "learning_rate": 3.416656523396111e-08, "logits/chosen": -3.172494649887085, "logits/rejected": -3.3157424926757812, "logps/chosen": -0.9829146862030029, "logps/rejected": -70.50743103027344, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 2.420619249343872, "rewards/margins": 2.420619249343872, "rewards/rejected": 0.0, "step": 2559 }, { "epoch": 14.30167597765363, "grad_norm": 0.3337373841054444, "learning_rate": 3.394545555606681e-08, "logits/chosen": -3.390951633453369, "logits/rejected": -3.4657745361328125, "logps/chosen": -1.568161129951477, "logps/rejected": -43.94268035888672, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 3.1630754470825195, "rewards/margins": 3.1630754470825195, "rewards/rejected": 0.0, "step": 2560 }, { "epoch": 14.30167597765363, "eval_logits/chosen": -3.255465030670166, "eval_logits/rejected": -3.378861665725708, "eval_logps/chosen": -30.90744400024414, "eval_logps/rejected": -56.270057678222656, "eval_loss": 1.056686520576477, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.08374010026454926, "eval_rewards/margins": -0.08374010026454926, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7032, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.306, "step": 2560 }, { "epoch": 14.307262569832401, "grad_norm": 0.4062184874937163, "learning_rate": 3.3725038518225224e-08, "logits/chosen": -2.9874773025512695, "logits/rejected": -3.006955623626709, "logps/chosen": -14.823481559753418, "logps/rejected": -57.64149856567383, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 3.2055070400238037, "rewards/margins": 3.2055070400238037, "rewards/rejected": 0.0, "step": 2561 }, { "epoch": 14.312849162011172, "grad_norm": 0.35126790298425564, "learning_rate": 3.3505314448015574e-08, "logits/chosen": -3.2318477630615234, "logits/rejected": -3.3444159030914307, "logps/chosen": -0.5783787965774536, "logps/rejected": -36.68168640136719, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.0725185871124268, "rewards/margins": 2.0725185871124268, "rewards/rejected": 0.0, "step": 2562 }, { "epoch": 14.318435754189943, "grad_norm": 0.5177101421785699, "learning_rate": 3.328628367198666e-08, "logits/chosen": -3.232680559158325, "logits/rejected": -3.465559244155884, "logps/chosen": -0.8780964612960815, "logps/rejected": -35.866153717041016, "loss": 0.1048, "rewards/accuracies": 1.0, "rewards/chosen": 2.195763111114502, "rewards/margins": 2.195763111114502, "rewards/rejected": 0.0, "step": 2563 }, { "epoch": 14.324022346368714, "grad_norm": 0.4864073422162365, "learning_rate": 3.306794651565753e-08, "logits/chosen": -3.5679423809051514, "logits/rejected": -3.545628786087036, "logps/chosen": -3.0100579261779785, "logps/rejected": -58.98371124267578, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.029555559158325, "rewards/margins": 2.029555559158325, "rewards/rejected": 0.0, "step": 2564 }, { "epoch": 14.329608938547485, "grad_norm": 0.35815007533728227, "learning_rate": 3.2850303303515794e-08, "logits/chosen": -3.2007193565368652, "logits/rejected": -3.4571046829223633, "logps/chosen": -2.7985048294067383, "logps/rejected": -53.030723571777344, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 3.0638461112976074, "rewards/margins": 3.0638461112976074, "rewards/rejected": 0.0, "step": 2565 }, { "epoch": 14.335195530726256, "grad_norm": 0.4100680157995391, "learning_rate": 3.263335435901821e-08, "logits/chosen": -3.204197645187378, "logits/rejected": -3.0503411293029785, "logps/chosen": -1.3740668296813965, "logps/rejected": -47.070106506347656, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 2.610954999923706, "rewards/margins": 2.610954999923706, "rewards/rejected": 0.0, "step": 2566 }, { "epoch": 14.340782122905027, "grad_norm": 1.2958371240100712, "learning_rate": 3.2417100004589526e-08, "logits/chosen": -3.090535879135132, "logits/rejected": -3.2727270126342773, "logps/chosen": -2.032452344894409, "logps/rejected": -106.64765167236328, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 2.0451555252075195, "rewards/margins": 2.0451555252075195, "rewards/rejected": 0.0, "step": 2567 }, { "epoch": 14.346368715083798, "grad_norm": 0.5461939516497486, "learning_rate": 3.220154056162216e-08, "logits/chosen": -3.3030285835266113, "logits/rejected": -3.4315459728240967, "logps/chosen": -3.357105255126953, "logps/rejected": -42.3886604309082, "loss": 0.1331, "rewards/accuracies": 1.0, "rewards/chosen": 3.027128219604492, "rewards/margins": 3.027128219604492, "rewards/rejected": 0.0, "step": 2568 }, { "epoch": 14.35195530726257, "grad_norm": 0.3496657848254997, "learning_rate": 3.198667635047592e-08, "logits/chosen": -3.306248664855957, "logits/rejected": -3.3260040283203125, "logps/chosen": -4.4225053787231445, "logps/rejected": -50.34362030029297, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 3.587620973587036, "rewards/margins": 3.587620973587036, "rewards/rejected": 0.0, "step": 2569 }, { "epoch": 14.35754189944134, "grad_norm": 0.3618740932211241, "learning_rate": 3.177250769047718e-08, "logits/chosen": -3.3666021823883057, "logits/rejected": -3.229046106338501, "logps/chosen": -8.65818977355957, "logps/rejected": -52.667110443115234, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 3.7629356384277344, "rewards/margins": 3.7629356384277344, "rewards/rejected": 0.0, "step": 2570 }, { "epoch": 14.363128491620111, "grad_norm": 0.39086312368819265, "learning_rate": 3.155903489991879e-08, "logits/chosen": -3.4258573055267334, "logits/rejected": -3.39311146736145, "logps/chosen": -3.4805006980895996, "logps/rejected": -29.188730239868164, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 2.843630313873291, "rewards/margins": 2.843630313873291, "rewards/rejected": 0.0, "step": 2571 }, { "epoch": 14.368715083798882, "grad_norm": 0.430391431592419, "learning_rate": 3.134625829605925e-08, "logits/chosen": -3.3774266242980957, "logits/rejected": -3.485069513320923, "logps/chosen": -1.3629817962646484, "logps/rejected": -63.495811462402344, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.650252103805542, "rewards/margins": 2.650252103805542, "rewards/rejected": 0.0, "step": 2572 }, { "epoch": 14.374301675977653, "grad_norm": 0.4148997561832557, "learning_rate": 3.113417819512265e-08, "logits/chosen": -3.3535616397857666, "logits/rejected": -3.6837968826293945, "logps/chosen": -2.745608329772949, "logps/rejected": -50.464202880859375, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.9353556632995605, "rewards/margins": 2.9353556632995605, "rewards/rejected": 0.0, "step": 2573 }, { "epoch": 14.379888268156424, "grad_norm": 0.40928154109029374, "learning_rate": 3.09227949122976e-08, "logits/chosen": -3.064005136489868, "logits/rejected": -3.35892915725708, "logps/chosen": -0.336436927318573, "logps/rejected": -98.37713623046875, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 2.0511691570281982, "rewards/margins": 2.0511691570281982, "rewards/rejected": 0.0, "step": 2574 }, { "epoch": 14.385474860335195, "grad_norm": 0.3885864644716112, "learning_rate": 3.071210876173741e-08, "logits/chosen": -3.5664894580841064, "logits/rejected": -3.6670656204223633, "logps/chosen": -1.5626771450042725, "logps/rejected": -50.25803756713867, "loss": 0.11, "rewards/accuracies": 1.0, "rewards/chosen": 2.824280261993408, "rewards/margins": 2.824280261993408, "rewards/rejected": 0.0, "step": 2575 }, { "epoch": 14.391061452513966, "grad_norm": 0.39290590716934026, "learning_rate": 3.0502120056559225e-08, "logits/chosen": -3.382290840148926, "logits/rejected": -3.4783260822296143, "logps/chosen": -3.0286645889282227, "logps/rejected": -51.1189079284668, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 3.124781847000122, "rewards/margins": 3.124781847000122, "rewards/rejected": 0.0, "step": 2576 }, { "epoch": 14.396648044692737, "grad_norm": 0.8561349864475638, "learning_rate": 3.029282910884362e-08, "logits/chosen": -3.3393774032592773, "logits/rejected": -3.3798530101776123, "logps/chosen": -0.8968712091445923, "logps/rejected": -67.7798080444336, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.1824569702148438, "rewards/margins": 2.1824569702148438, "rewards/rejected": 0.0, "step": 2577 }, { "epoch": 14.402234636871508, "grad_norm": 0.38494307401336597, "learning_rate": 3.008423622963424e-08, "logits/chosen": -3.4091882705688477, "logits/rejected": -3.287452459335327, "logps/chosen": -0.607304036617279, "logps/rejected": -57.99492645263672, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 1.7982110977172852, "rewards/margins": 1.7982110977172852, "rewards/rejected": 0.0, "step": 2578 }, { "epoch": 14.40782122905028, "grad_norm": 0.39067061708928996, "learning_rate": 2.987634172893727e-08, "logits/chosen": -3.2327842712402344, "logits/rejected": -3.473045587539673, "logps/chosen": -1.3123928308486938, "logps/rejected": -74.77027893066406, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 2.2215685844421387, "rewards/margins": 2.2215685844421387, "rewards/rejected": 0.0, "step": 2579 }, { "epoch": 14.41340782122905, "grad_norm": 0.35512891358892545, "learning_rate": 2.9669145915720896e-08, "logits/chosen": -3.2294089794158936, "logits/rejected": -3.275222063064575, "logps/chosen": -0.3199467062950134, "logps/rejected": -84.38740539550781, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 1.7708466053009033, "rewards/margins": 1.7708466053009033, "rewards/rejected": 0.0, "step": 2580 }, { "epoch": 14.41340782122905, "eval_logits/chosen": -3.2508492469787598, "eval_logits/rejected": -3.3756213188171387, "eval_logps/chosen": -30.67477035522461, "eval_logps/rejected": -56.07917022705078, "eval_loss": 1.0568053722381592, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.06047270447015762, "eval_rewards/margins": -0.06047270447015762, "eval_rewards/rejected": 0.0, "eval_runtime": 32.693, "eval_samples_per_second": 9.482, "eval_steps_per_second": 0.306, "step": 2580 }, { "epoch": 14.418994413407821, "grad_norm": 0.40818006036734, "learning_rate": 2.9462649097915015e-08, "logits/chosen": -3.4440417289733887, "logits/rejected": -3.4083168506622314, "logps/chosen": -10.291391372680664, "logps/rejected": -36.64921569824219, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 3.854292869567871, "rewards/margins": 3.854292869567871, "rewards/rejected": 0.0, "step": 2581 }, { "epoch": 14.424581005586592, "grad_norm": 0.3520335560565263, "learning_rate": 2.9256851582410756e-08, "logits/chosen": -3.4099106788635254, "logits/rejected": -3.474188804626465, "logps/chosen": -0.756075918674469, "logps/rejected": -31.355152130126953, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 2.440232753753662, "rewards/margins": 2.440232753753662, "rewards/rejected": 0.0, "step": 2582 }, { "epoch": 14.430167597765363, "grad_norm": 0.4478345951399215, "learning_rate": 2.9051753675059744e-08, "logits/chosen": -3.2319183349609375, "logits/rejected": -3.228318452835083, "logps/chosen": -0.3813924193382263, "logps/rejected": -126.91836547851562, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 2.2227683067321777, "rewards/margins": 2.2227683067321777, "rewards/rejected": 0.0, "step": 2583 }, { "epoch": 14.435754189944134, "grad_norm": 0.4553624792675634, "learning_rate": 2.884735568067409e-08, "logits/chosen": -3.5689172744750977, "logits/rejected": -3.6130666732788086, "logps/chosen": -1.2657164335250854, "logps/rejected": -44.229610443115234, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": 2.909900188446045, "rewards/margins": 2.909900188446045, "rewards/rejected": 0.0, "step": 2584 }, { "epoch": 14.441340782122905, "grad_norm": 0.4124485290524065, "learning_rate": 2.864365790302564e-08, "logits/chosen": -3.4473507404327393, "logits/rejected": -3.515202045440674, "logps/chosen": -3.42461895942688, "logps/rejected": -63.259376525878906, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": 2.9966366291046143, "rewards/margins": 2.9966366291046143, "rewards/rejected": 0.0, "step": 2585 }, { "epoch": 14.446927374301676, "grad_norm": 0.38026992578702373, "learning_rate": 2.844066064484546e-08, "logits/chosen": -3.2951223850250244, "logits/rejected": -3.399473190307617, "logps/chosen": -0.6012113094329834, "logps/rejected": -53.583499908447266, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": 2.072929859161377, "rewards/margins": 2.072929859161377, "rewards/rejected": 0.0, "step": 2586 }, { "epoch": 14.452513966480447, "grad_norm": 0.40418546186396853, "learning_rate": 2.8238364207823717e-08, "logits/chosen": -3.137958288192749, "logits/rejected": -3.0798516273498535, "logps/chosen": -20.386611938476562, "logps/rejected": -57.91345977783203, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 2.8627772331237793, "rewards/margins": 2.8627772331237793, "rewards/rejected": 0.0, "step": 2587 }, { "epoch": 14.458100558659218, "grad_norm": 0.4793948181881813, "learning_rate": 2.803676889260892e-08, "logits/chosen": -3.414140462875366, "logits/rejected": -3.2515978813171387, "logps/chosen": -5.7660675048828125, "logps/rejected": -34.76047134399414, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 3.0209760665893555, "rewards/margins": 3.0209760665893555, "rewards/rejected": 0.0, "step": 2588 }, { "epoch": 14.46368715083799, "grad_norm": 0.36959361743706953, "learning_rate": 2.7835874998807575e-08, "logits/chosen": -3.1530601978302, "logits/rejected": -3.2044665813446045, "logps/chosen": -6.9558563232421875, "logps/rejected": -76.71287536621094, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 3.355774164199829, "rewards/margins": 3.355774164199829, "rewards/rejected": 0.0, "step": 2589 }, { "epoch": 14.46927374301676, "grad_norm": 0.414943361556342, "learning_rate": 2.763568282498374e-08, "logits/chosen": -2.955747365951538, "logits/rejected": -3.082019567489624, "logps/chosen": -11.286369323730469, "logps/rejected": -44.17339324951172, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": 3.6280245780944824, "rewards/margins": 3.6280245780944824, "rewards/rejected": 0.0, "step": 2590 }, { "epoch": 14.474860335195531, "grad_norm": 0.45495298373564647, "learning_rate": 2.7436192668658876e-08, "logits/chosen": -3.0897738933563232, "logits/rejected": -3.135404109954834, "logps/chosen": -3.8595781326293945, "logps/rejected": -71.08548736572266, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 2.9746313095092773, "rewards/margins": 2.9746313095092773, "rewards/rejected": 0.0, "step": 2591 }, { "epoch": 14.480446927374302, "grad_norm": 0.38281341793748186, "learning_rate": 2.7237404826310595e-08, "logits/chosen": -3.3906188011169434, "logits/rejected": -3.6005892753601074, "logps/chosen": -4.082036018371582, "logps/rejected": -43.455020904541016, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 3.56838321685791, "rewards/margins": 3.56838321685791, "rewards/rejected": 0.0, "step": 2592 }, { "epoch": 14.486033519553073, "grad_norm": 0.37134089965678546, "learning_rate": 2.7039319593373245e-08, "logits/chosen": -3.496018171310425, "logits/rejected": -3.381351947784424, "logps/chosen": -2.4031014442443848, "logps/rejected": -39.70995330810547, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": 3.413717746734619, "rewards/margins": 3.413717746734619, "rewards/rejected": 0.0, "step": 2593 }, { "epoch": 14.491620111731844, "grad_norm": 0.46900536140110083, "learning_rate": 2.684193726423656e-08, "logits/chosen": -3.3678228855133057, "logits/rejected": -3.2361197471618652, "logps/chosen": -3.444424867630005, "logps/rejected": -67.34076690673828, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 2.425002098083496, "rewards/margins": 2.425002098083496, "rewards/rejected": 0.0, "step": 2594 }, { "epoch": 14.497206703910614, "grad_norm": 0.36701090376847895, "learning_rate": 2.6645258132245994e-08, "logits/chosen": -3.5504260063171387, "logits/rejected": -3.4358696937561035, "logps/chosen": -1.103353500366211, "logps/rejected": -43.75018310546875, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 2.983163833618164, "rewards/margins": 2.983163833618164, "rewards/rejected": 0.0, "step": 2595 }, { "epoch": 14.502793296089386, "grad_norm": 0.39706166504429785, "learning_rate": 2.644928248970174e-08, "logits/chosen": -3.490511178970337, "logits/rejected": -3.5199904441833496, "logps/chosen": -0.4865378439426422, "logps/rejected": -50.04668426513672, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 2.2721471786499023, "rewards/margins": 2.2721471786499023, "rewards/rejected": 0.0, "step": 2596 }, { "epoch": 14.508379888268156, "grad_norm": 0.5558956921166254, "learning_rate": 2.6254010627858425e-08, "logits/chosen": -3.393669366836548, "logits/rejected": -3.3033180236816406, "logps/chosen": -0.1819501668214798, "logps/rejected": -62.02227783203125, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": 1.5980136394500732, "rewards/margins": 1.5980136394500732, "rewards/rejected": 0.0, "step": 2597 }, { "epoch": 14.513966480446927, "grad_norm": 0.48786314777861517, "learning_rate": 2.6059442836924907e-08, "logits/chosen": -3.3012449741363525, "logits/rejected": -3.271615505218506, "logps/chosen": -3.1856186389923096, "logps/rejected": -50.03583908081055, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 3.0265607833862305, "rewards/margins": 3.0265607833862305, "rewards/rejected": 0.0, "step": 2598 }, { "epoch": 14.519553072625698, "grad_norm": 0.32651791115638257, "learning_rate": 2.586557940606349e-08, "logits/chosen": -3.349930763244629, "logits/rejected": -3.3481690883636475, "logps/chosen": -0.39179161190986633, "logps/rejected": -27.280929565429688, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 2.166875123977661, "rewards/margins": 2.166875123977661, "rewards/rejected": 0.0, "step": 2599 }, { "epoch": 14.525139664804469, "grad_norm": 0.823962753447027, "learning_rate": 2.5672420623389867e-08, "logits/chosen": -3.3070006370544434, "logits/rejected": -3.481149435043335, "logps/chosen": -1.2035163640975952, "logps/rejected": -70.06402587890625, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 2.6678123474121094, "rewards/margins": 2.6678123474121094, "rewards/rejected": 0.0, "step": 2600 }, { "epoch": 14.525139664804469, "eval_logits/chosen": -3.251675844192505, "eval_logits/rejected": -3.3768248558044434, "eval_logps/chosen": -30.821081161499023, "eval_logps/rejected": -56.12659454345703, "eval_loss": 1.052950143814087, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.07510419934988022, "eval_rewards/margins": -0.07510419934988022, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6999, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.306, "step": 2600 }, { "epoch": 14.53072625698324, "grad_norm": 0.35034672718428556, "learning_rate": 2.5479966775972196e-08, "logits/chosen": -3.4400594234466553, "logits/rejected": -3.4662489891052246, "logps/chosen": -0.2684183418750763, "logps/rejected": -53.87624740600586, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 1.6512014865875244, "rewards/margins": 1.6512014865875244, "rewards/rejected": 0.0, "step": 2601 }, { "epoch": 14.53631284916201, "grad_norm": 0.33204673460745693, "learning_rate": 2.5288218149831398e-08, "logits/chosen": -2.746803045272827, "logits/rejected": -2.8619046211242676, "logps/chosen": -38.81970977783203, "logps/rejected": -39.522727966308594, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 4.217256546020508, "rewards/margins": 4.217256546020508, "rewards/rejected": 0.0, "step": 2602 }, { "epoch": 14.541899441340782, "grad_norm": 0.35216370747195497, "learning_rate": 2.509717502993991e-08, "logits/chosen": -3.466219425201416, "logits/rejected": -3.3800175189971924, "logps/chosen": -4.04020881652832, "logps/rejected": -34.599159240722656, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 2.462942600250244, "rewards/margins": 2.462942600250244, "rewards/rejected": 0.0, "step": 2603 }, { "epoch": 14.547486033519553, "grad_norm": 0.37850456251304454, "learning_rate": 2.4906837700221895e-08, "logits/chosen": -3.2788219451904297, "logits/rejected": -3.29059100151062, "logps/chosen": -19.956703186035156, "logps/rejected": -68.52668762207031, "loss": 0.11, "rewards/accuracies": 1.0, "rewards/chosen": 3.91402530670166, "rewards/margins": 3.91402530670166, "rewards/rejected": 0.0, "step": 2604 }, { "epoch": 14.553072625698324, "grad_norm": 0.3620836293269757, "learning_rate": 2.4717206443552573e-08, "logits/chosen": -3.2138986587524414, "logits/rejected": -3.1645960807800293, "logps/chosen": -7.612255573272705, "logps/rejected": -41.527992248535156, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 3.1727817058563232, "rewards/margins": 3.1727817058563232, "rewards/rejected": 0.0, "step": 2605 }, { "epoch": 14.558659217877095, "grad_norm": 0.3769520825069747, "learning_rate": 2.4528281541757677e-08, "logits/chosen": -3.6428723335266113, "logits/rejected": -3.611280679702759, "logps/chosen": -1.5903761386871338, "logps/rejected": -39.019962310791016, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 2.8951711654663086, "rewards/margins": 2.8951711654663086, "rewards/rejected": 0.0, "step": 2606 }, { "epoch": 14.564245810055866, "grad_norm": 0.5338666848084862, "learning_rate": 2.4340063275613333e-08, "logits/chosen": -3.184285879135132, "logits/rejected": -3.4263367652893066, "logps/chosen": -0.8953671455383301, "logps/rejected": -40.780067443847656, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 2.0662784576416016, "rewards/margins": 2.0662784576416016, "rewards/rejected": 0.0, "step": 2607 }, { "epoch": 14.569832402234637, "grad_norm": 0.44377925234671706, "learning_rate": 2.415255192484533e-08, "logits/chosen": -3.537266254425049, "logits/rejected": -3.3981332778930664, "logps/chosen": -13.35228443145752, "logps/rejected": -40.480587005615234, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 4.2639312744140625, "rewards/margins": 4.2639312744140625, "rewards/rejected": 0.0, "step": 2608 }, { "epoch": 14.575418994413408, "grad_norm": 0.3736243148903746, "learning_rate": 2.396574776812893e-08, "logits/chosen": -3.4469752311706543, "logits/rejected": -3.5112974643707275, "logps/chosen": -1.0641577243804932, "logps/rejected": -59.10731887817383, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 2.733170986175537, "rewards/margins": 2.733170986175537, "rewards/rejected": 0.0, "step": 2609 }, { "epoch": 14.581005586592179, "grad_norm": 0.36598148041686873, "learning_rate": 2.377965108308838e-08, "logits/chosen": -3.415602684020996, "logits/rejected": -3.525176525115967, "logps/chosen": -0.1782577633857727, "logps/rejected": -56.63910675048828, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 1.8573319911956787, "rewards/margins": 1.8573319911956787, "rewards/rejected": 0.0, "step": 2610 }, { "epoch": 14.58659217877095, "grad_norm": 0.37965183369100297, "learning_rate": 2.3594262146296672e-08, "logits/chosen": -3.457428216934204, "logits/rejected": -3.622685670852661, "logps/chosen": -0.9535687565803528, "logps/rejected": -60.888526916503906, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 2.5787744522094727, "rewards/margins": 2.5787744522094727, "rewards/rejected": 0.0, "step": 2611 }, { "epoch": 14.59217877094972, "grad_norm": 0.3711794585903184, "learning_rate": 2.3409581233274466e-08, "logits/chosen": -3.2968878746032715, "logits/rejected": -3.3927321434020996, "logps/chosen": -0.4315730631351471, "logps/rejected": -76.43840026855469, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 2.3661458492279053, "rewards/margins": 2.3661458492279053, "rewards/rejected": 0.0, "step": 2612 }, { "epoch": 14.597765363128492, "grad_norm": 0.418805128932327, "learning_rate": 2.3225608618490766e-08, "logits/chosen": -3.59462308883667, "logits/rejected": -3.6426193714141846, "logps/chosen": -6.757093906402588, "logps/rejected": -29.405033111572266, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.983060598373413, "rewards/margins": 2.983060598373413, "rewards/rejected": 0.0, "step": 2613 }, { "epoch": 14.603351955307263, "grad_norm": 0.34918248560991544, "learning_rate": 2.3042344575361528e-08, "logits/chosen": -3.340179920196533, "logits/rejected": -3.330141305923462, "logps/chosen": -1.3053386211395264, "logps/rejected": -34.125205993652344, "loss": 0.092, "rewards/accuracies": 1.0, "rewards/chosen": 2.5008788108825684, "rewards/margins": 2.5008788108825684, "rewards/rejected": 0.0, "step": 2614 }, { "epoch": 14.608938547486034, "grad_norm": 0.4830880413118678, "learning_rate": 2.2859789376249716e-08, "logits/chosen": -3.3018860816955566, "logits/rejected": -3.420578718185425, "logps/chosen": -1.1772595643997192, "logps/rejected": -58.97906494140625, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 2.2907752990722656, "rewards/margins": 2.2907752990722656, "rewards/rejected": 0.0, "step": 2615 }, { "epoch": 14.614525139664805, "grad_norm": 0.5430887252058046, "learning_rate": 2.2677943292464973e-08, "logits/chosen": -3.4364094734191895, "logits/rejected": -3.503571033477783, "logps/chosen": -0.9159830212593079, "logps/rejected": -69.68579864501953, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 2.6822304725646973, "rewards/margins": 2.6822304725646973, "rewards/rejected": 0.0, "step": 2616 }, { "epoch": 14.620111731843576, "grad_norm": 0.48130237763903183, "learning_rate": 2.249680659426284e-08, "logits/chosen": -3.2569284439086914, "logits/rejected": -3.2110025882720947, "logps/chosen": -3.1655027866363525, "logps/rejected": -65.03008270263672, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 2.6212363243103027, "rewards/margins": 2.6212363243103027, "rewards/rejected": 0.0, "step": 2617 }, { "epoch": 14.625698324022347, "grad_norm": 0.4390675915180137, "learning_rate": 2.231637955084481e-08, "logits/chosen": -3.1946511268615723, "logits/rejected": -3.288429021835327, "logps/chosen": -18.75676727294922, "logps/rejected": -54.70408630371094, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 3.4994304180145264, "rewards/margins": 3.4994304180145264, "rewards/rejected": 0.0, "step": 2618 }, { "epoch": 14.631284916201118, "grad_norm": 1.2330578550313098, "learning_rate": 2.21366624303575e-08, "logits/chosen": -3.0974862575531006, "logits/rejected": -3.4215402603149414, "logps/chosen": -2.1984615325927734, "logps/rejected": -55.712890625, "loss": 0.1385, "rewards/accuracies": 1.0, "rewards/chosen": 1.8407673835754395, "rewards/margins": 1.8407673835754395, "rewards/rejected": 0.0, "step": 2619 }, { "epoch": 14.636871508379889, "grad_norm": 0.4838777218585874, "learning_rate": 2.195765549989276e-08, "logits/chosen": -3.335300922393799, "logits/rejected": -3.259881019592285, "logps/chosen": -1.1644904613494873, "logps/rejected": -39.106361389160156, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 2.7018516063690186, "rewards/margins": 2.7018516063690186, "rewards/rejected": 0.0, "step": 2620 }, { "epoch": 14.636871508379889, "eval_logits/chosen": -3.2521300315856934, "eval_logits/rejected": -3.377594470977783, "eval_logps/chosen": -30.75811767578125, "eval_logps/rejected": -56.20076370239258, "eval_loss": 1.052037239074707, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.06880731880664825, "eval_rewards/margins": -0.06880731880664825, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7264, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 2620 }, { "epoch": 14.64245810055866, "grad_norm": 0.3687159054789329, "learning_rate": 2.1779359025486498e-08, "logits/chosen": -3.552938222885132, "logits/rejected": -3.731419563293457, "logps/chosen": -3.871910572052002, "logps/rejected": -38.07981491088867, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.2329859733581543, "rewards/margins": 3.2329859733581543, "rewards/rejected": 0.0, "step": 2621 }, { "epoch": 14.64804469273743, "grad_norm": 0.3875358266687342, "learning_rate": 2.1601773272119318e-08, "logits/chosen": -3.381281614303589, "logits/rejected": -3.2493088245391846, "logps/chosen": -0.6715680360794067, "logps/rejected": -55.6790771484375, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 2.4488768577575684, "rewards/margins": 2.4488768577575684, "rewards/rejected": 0.0, "step": 2622 }, { "epoch": 14.653631284916202, "grad_norm": 0.4122394220220475, "learning_rate": 2.1424898503714985e-08, "logits/chosen": -3.5211215019226074, "logits/rejected": -3.5921006202697754, "logps/chosen": -2.2461485862731934, "logps/rejected": -42.21638488769531, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.5515592098236084, "rewards/margins": 2.5515592098236084, "rewards/rejected": 0.0, "step": 2623 }, { "epoch": 14.659217877094973, "grad_norm": 0.3859643799676988, "learning_rate": 2.124873498314106e-08, "logits/chosen": -3.062135696411133, "logits/rejected": -3.137162923812866, "logps/chosen": -3.3561148643493652, "logps/rejected": -31.430452346801758, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 2.6686174869537354, "rewards/margins": 2.6686174869537354, "rewards/rejected": 0.0, "step": 2624 }, { "epoch": 14.664804469273744, "grad_norm": 0.4212571230933188, "learning_rate": 2.1073282972207896e-08, "logits/chosen": -3.439175605773926, "logits/rejected": -3.3376693725585938, "logps/chosen": -5.655553817749023, "logps/rejected": -68.16014862060547, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 3.194246768951416, "rewards/margins": 3.194246768951416, "rewards/rejected": 0.0, "step": 2625 }, { "epoch": 14.670391061452515, "grad_norm": 0.3567387071922027, "learning_rate": 2.089854273166841e-08, "logits/chosen": -3.439663887023926, "logits/rejected": -3.444983959197998, "logps/chosen": -0.3894228935241699, "logps/rejected": -29.912153244018555, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 2.1726903915405273, "rewards/margins": 2.1726903915405273, "rewards/rejected": 0.0, "step": 2626 }, { "epoch": 14.675977653631286, "grad_norm": 0.3520192576168209, "learning_rate": 2.0724514521217763e-08, "logits/chosen": -3.247497081756592, "logits/rejected": -3.4018421173095703, "logps/chosen": -0.22753706574440002, "logps/rejected": -39.27146530151367, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 1.6547861099243164, "rewards/margins": 1.6547861099243164, "rewards/rejected": 0.0, "step": 2627 }, { "epoch": 14.681564245810057, "grad_norm": 0.3970515001799419, "learning_rate": 2.0551198599492836e-08, "logits/chosen": -3.331470489501953, "logits/rejected": -3.4937338829040527, "logps/chosen": -7.155062675476074, "logps/rejected": -32.809383392333984, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 3.697918176651001, "rewards/margins": 3.697918176651001, "rewards/rejected": 0.0, "step": 2628 }, { "epoch": 14.687150837988828, "grad_norm": 0.45355096438905357, "learning_rate": 2.0378595224071925e-08, "logits/chosen": -3.2169480323791504, "logits/rejected": -3.1807217597961426, "logps/chosen": -7.081135272979736, "logps/rejected": -33.9853630065918, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 4.2905778884887695, "rewards/margins": 4.2905778884887695, "rewards/rejected": 0.0, "step": 2629 }, { "epoch": 14.692737430167599, "grad_norm": 0.350916390676908, "learning_rate": 2.0206704651474336e-08, "logits/chosen": -3.228888988494873, "logits/rejected": -3.2587034702301025, "logps/chosen": -1.4819749593734741, "logps/rejected": -37.1948127746582, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 2.5577054023742676, "rewards/margins": 2.5577054023742676, "rewards/rejected": 0.0, "step": 2630 }, { "epoch": 14.69832402234637, "grad_norm": 0.4109007574926235, "learning_rate": 2.003552713716028e-08, "logits/chosen": -3.43320631980896, "logits/rejected": -3.302509307861328, "logps/chosen": -3.303668975830078, "logps/rejected": -30.332958221435547, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": 3.254763126373291, "rewards/margins": 3.254763126373291, "rewards/rejected": 0.0, "step": 2631 }, { "epoch": 14.703910614525139, "grad_norm": 1.0999709243811522, "learning_rate": 1.9865062935529808e-08, "logits/chosen": -3.463435173034668, "logits/rejected": -3.440654754638672, "logps/chosen": -4.47990608215332, "logps/rejected": -64.33112335205078, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.208681106567383, "rewards/margins": 3.208681106567383, "rewards/rejected": 0.0, "step": 2632 }, { "epoch": 14.70949720670391, "grad_norm": 0.481934449821989, "learning_rate": 1.9695312299923162e-08, "logits/chosen": -3.424265146255493, "logits/rejected": -3.4145216941833496, "logps/chosen": -0.8707083463668823, "logps/rejected": -77.55708312988281, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.0164778232574463, "rewards/margins": 2.0164778232574463, "rewards/rejected": 0.0, "step": 2633 }, { "epoch": 14.71508379888268, "grad_norm": 0.3750543951241516, "learning_rate": 1.952627548262009e-08, "logits/chosen": -3.446359395980835, "logits/rejected": -3.3021953105926514, "logps/chosen": -4.184319972991943, "logps/rejected": -54.65296936035156, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": 3.354356527328491, "rewards/margins": 3.354356527328491, "rewards/rejected": 0.0, "step": 2634 }, { "epoch": 14.720670391061452, "grad_norm": 0.44207718376243293, "learning_rate": 1.9357952734839312e-08, "logits/chosen": -2.96549654006958, "logits/rejected": -2.860167980194092, "logps/chosen": -32.893165588378906, "logps/rejected": -67.67338562011719, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 4.400967597961426, "rewards/margins": 4.400967597961426, "rewards/rejected": 0.0, "step": 2635 }, { "epoch": 14.726256983240223, "grad_norm": 0.38039342394313047, "learning_rate": 1.919034430673844e-08, "logits/chosen": -3.536604404449463, "logits/rejected": -3.4818167686462402, "logps/chosen": -1.9360244274139404, "logps/rejected": -59.25251388549805, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 2.3960742950439453, "rewards/margins": 2.3960742950439453, "rewards/rejected": 0.0, "step": 2636 }, { "epoch": 14.731843575418994, "grad_norm": 0.3490895639330724, "learning_rate": 1.9023450447413436e-08, "logits/chosen": -3.300827980041504, "logits/rejected": -3.4647958278656006, "logps/chosen": -0.49746406078338623, "logps/rejected": -57.20527648925781, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 2.0046162605285645, "rewards/margins": 2.0046162605285645, "rewards/rejected": 0.0, "step": 2637 }, { "epoch": 14.737430167597765, "grad_norm": 0.40550564877263295, "learning_rate": 1.885727140489829e-08, "logits/chosen": -3.253418445587158, "logits/rejected": -3.5220563411712646, "logps/chosen": -0.15144218504428864, "logps/rejected": -111.05549621582031, "loss": 0.1321, "rewards/accuracies": 1.0, "rewards/chosen": 1.3530927896499634, "rewards/margins": 1.3530927896499634, "rewards/rejected": 0.0, "step": 2638 }, { "epoch": 14.743016759776536, "grad_norm": 0.35351196643128785, "learning_rate": 1.8691807426164607e-08, "logits/chosen": -3.5048904418945312, "logits/rejected": -3.4716110229492188, "logps/chosen": -13.23552417755127, "logps/rejected": -54.78971862792969, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": 3.246061086654663, "rewards/margins": 3.246061086654663, "rewards/rejected": 0.0, "step": 2639 }, { "epoch": 14.748603351955307, "grad_norm": 0.4608382579300712, "learning_rate": 1.852705875712157e-08, "logits/chosen": -3.4893388748168945, "logits/rejected": -3.3630337715148926, "logps/chosen": -4.5363874435424805, "logps/rejected": -38.974388122558594, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 3.436383008956909, "rewards/margins": 3.436383008956909, "rewards/rejected": 0.0, "step": 2640 }, { "epoch": 14.748603351955307, "eval_logits/chosen": -3.2588112354278564, "eval_logits/rejected": -3.381526470184326, "eval_logps/chosen": -30.6961727142334, "eval_logps/rejected": -56.20176315307617, "eval_loss": 1.0570828914642334, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.06261324137449265, "eval_rewards/margins": -0.06261324137449265, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7221, "eval_samples_per_second": 9.474, "eval_steps_per_second": 0.306, "step": 2640 }, { "epoch": 14.754189944134078, "grad_norm": 0.36795394142857324, "learning_rate": 1.836302564261466e-08, "logits/chosen": -3.246004581451416, "logits/rejected": -3.4199206829071045, "logps/chosen": -0.334102988243103, "logps/rejected": -123.59196472167969, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 1.6130549907684326, "rewards/margins": 1.6130549907684326, "rewards/rejected": 0.0, "step": 2641 }, { "epoch": 14.759776536312849, "grad_norm": 0.405662194125218, "learning_rate": 1.8199708326426643e-08, "logits/chosen": -3.481472969055176, "logits/rejected": -3.556199073791504, "logps/chosen": -0.28236332535743713, "logps/rejected": -64.80596923828125, "loss": 0.1208, "rewards/accuracies": 1.0, "rewards/chosen": 1.7299556732177734, "rewards/margins": 1.7299556732177734, "rewards/rejected": 0.0, "step": 2642 }, { "epoch": 14.76536312849162, "grad_norm": 0.36918646172301467, "learning_rate": 1.8037107051275923e-08, "logits/chosen": -3.6038081645965576, "logits/rejected": -3.556504487991333, "logps/chosen": -3.957348585128784, "logps/rejected": -57.72964096069336, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 3.272129774093628, "rewards/margins": 3.272129774093628, "rewards/rejected": 0.0, "step": 2643 }, { "epoch": 14.77094972067039, "grad_norm": 0.42410614403411123, "learning_rate": 1.787522205881703e-08, "logits/chosen": -3.5083959102630615, "logits/rejected": -3.3299269676208496, "logps/chosen": -4.993856430053711, "logps/rejected": -41.53742980957031, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 3.24393367767334, "rewards/margins": 3.24393367767334, "rewards/rejected": 0.0, "step": 2644 }, { "epoch": 14.776536312849162, "grad_norm": 0.43327424424618927, "learning_rate": 1.77140535896399e-08, "logits/chosen": -3.5569281578063965, "logits/rejected": -3.5528361797332764, "logps/chosen": -3.9627926349639893, "logps/rejected": -30.01145362854004, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 3.1907334327697754, "rewards/margins": 3.1907334327697754, "rewards/rejected": 0.0, "step": 2645 }, { "epoch": 14.782122905027933, "grad_norm": 0.3958390691017802, "learning_rate": 1.7553601883269597e-08, "logits/chosen": -3.432112455368042, "logits/rejected": -3.261931896209717, "logps/chosen": -3.6665518283843994, "logps/rejected": -92.46833801269531, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 3.111374855041504, "rewards/margins": 3.111374855041504, "rewards/rejected": 0.0, "step": 2646 }, { "epoch": 14.787709497206704, "grad_norm": 0.38290720230321823, "learning_rate": 1.739386717816582e-08, "logits/chosen": -3.3596928119659424, "logits/rejected": -3.4711050987243652, "logps/chosen": -0.6624868512153625, "logps/rejected": -60.173095703125, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 2.2967941761016846, "rewards/margins": 2.2967941761016846, "rewards/rejected": 0.0, "step": 2647 }, { "epoch": 14.793296089385475, "grad_norm": 0.37660516483796985, "learning_rate": 1.7234849711722842e-08, "logits/chosen": -3.545567512512207, "logits/rejected": -3.5239815711975098, "logps/chosen": -2.9479568004608154, "logps/rejected": -100.97955322265625, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 2.462566375732422, "rewards/margins": 2.462566375732422, "rewards/rejected": 0.0, "step": 2648 }, { "epoch": 14.798882681564246, "grad_norm": 0.7143914445117544, "learning_rate": 1.7076549720269116e-08, "logits/chosen": -3.246804714202881, "logits/rejected": -3.3367602825164795, "logps/chosen": -0.2639075815677643, "logps/rejected": -54.3329963684082, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 1.8515474796295166, "rewards/margins": 1.8515474796295166, "rewards/rejected": 0.0, "step": 2649 }, { "epoch": 14.804469273743017, "grad_norm": 0.40277215037576825, "learning_rate": 1.6918967439066344e-08, "logits/chosen": -3.342604875564575, "logits/rejected": -3.419504165649414, "logps/chosen": -2.6654586791992188, "logps/rejected": -57.11647033691406, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 2.9482383728027344, "rewards/margins": 2.9482383728027344, "rewards/rejected": 0.0, "step": 2650 }, { "epoch": 14.810055865921788, "grad_norm": 0.4039025045873746, "learning_rate": 1.6762103102310077e-08, "logits/chosen": -3.5208730697631836, "logits/rejected": -2.899697780609131, "logps/chosen": -0.5285656452178955, "logps/rejected": -153.28765869140625, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 2.113286256790161, "rewards/margins": 2.113286256790161, "rewards/rejected": 0.0, "step": 2651 }, { "epoch": 14.815642458100559, "grad_norm": 0.3418434975746601, "learning_rate": 1.660595694312844e-08, "logits/chosen": -3.465338945388794, "logits/rejected": -3.5613439083099365, "logps/chosen": -1.449244737625122, "logps/rejected": -83.42619323730469, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 2.855820655822754, "rewards/margins": 2.855820655822754, "rewards/rejected": 0.0, "step": 2652 }, { "epoch": 14.82122905027933, "grad_norm": 0.3613168884981856, "learning_rate": 1.6450529193582586e-08, "logits/chosen": -3.3460452556610107, "logits/rejected": -3.55086350440979, "logps/chosen": -0.679241418838501, "logps/rejected": -49.6741943359375, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 2.1192545890808105, "rewards/margins": 2.1192545890808105, "rewards/rejected": 0.0, "step": 2653 }, { "epoch": 14.8268156424581, "grad_norm": 0.35899306172298456, "learning_rate": 1.6295820084665845e-08, "logits/chosen": -3.6450414657592773, "logits/rejected": -3.6122703552246094, "logps/chosen": -20.39347267150879, "logps/rejected": -44.38009262084961, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.465202808380127, "rewards/margins": 2.465202808380127, "rewards/rejected": 0.0, "step": 2654 }, { "epoch": 14.832402234636872, "grad_norm": 0.32645729268390417, "learning_rate": 1.6141829846303468e-08, "logits/chosen": -3.470165491104126, "logits/rejected": -3.3938283920288086, "logps/chosen": -0.5699958205223083, "logps/rejected": -48.6433219909668, "loss": 0.1056, "rewards/accuracies": 1.0, "rewards/chosen": 2.4341936111450195, "rewards/margins": 2.4341936111450195, "rewards/rejected": 0.0, "step": 2655 }, { "epoch": 14.837988826815643, "grad_norm": 0.5142779757414533, "learning_rate": 1.5988558707352327e-08, "logits/chosen": -3.477855682373047, "logits/rejected": -3.513197183609009, "logps/chosen": -48.18681716918945, "logps/rejected": -48.22716522216797, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 4.520964622497559, "rewards/margins": 4.520964622497559, "rewards/rejected": 0.0, "step": 2656 }, { "epoch": 14.843575418994414, "grad_norm": 0.48715230823999733, "learning_rate": 1.5836006895600717e-08, "logits/chosen": -3.271955966949463, "logits/rejected": -3.4795589447021484, "logps/chosen": -0.3449538052082062, "logps/rejected": -76.18074035644531, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 1.436866044998169, "rewards/margins": 1.436866044998169, "rewards/rejected": 0.0, "step": 2657 }, { "epoch": 14.849162011173185, "grad_norm": 0.3772270483117669, "learning_rate": 1.5684174637767723e-08, "logits/chosen": -3.569859266281128, "logits/rejected": -3.5872387886047363, "logps/chosen": -40.804996490478516, "logps/rejected": -63.99356460571289, "loss": 0.0954, "rewards/accuracies": 1.0, "rewards/chosen": 3.621565818786621, "rewards/margins": 3.621565818786621, "rewards/rejected": 0.0, "step": 2658 }, { "epoch": 14.854748603351956, "grad_norm": 0.6019501026357243, "learning_rate": 1.5533062159503064e-08, "logits/chosen": -3.6645209789276123, "logits/rejected": -3.5907912254333496, "logps/chosen": -11.660226821899414, "logps/rejected": -94.15673065185547, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 2.636493682861328, "rewards/margins": 2.636493682861328, "rewards/rejected": 0.0, "step": 2659 }, { "epoch": 14.860335195530727, "grad_norm": 0.39926621115797817, "learning_rate": 1.538266968538704e-08, "logits/chosen": -3.1913115978240967, "logits/rejected": -3.3153505325317383, "logps/chosen": -14.5741605758667, "logps/rejected": -61.53295135498047, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 3.4079737663269043, "rewards/margins": 3.4079737663269043, "rewards/rejected": 0.0, "step": 2660 }, { "epoch": 14.860335195530727, "eval_logits/chosen": -3.2547249794006348, "eval_logits/rejected": -3.3794593811035156, "eval_logps/chosen": -30.67974281311035, "eval_logps/rejected": -55.7110481262207, "eval_loss": 1.0554580688476562, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.060970108956098557, "eval_rewards/margins": -0.060970108956098557, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7183, "eval_samples_per_second": 9.475, "eval_steps_per_second": 0.306, "step": 2660 }, { "epoch": 14.865921787709498, "grad_norm": 0.4576728086417737, "learning_rate": 1.5232997438929307e-08, "logits/chosen": -3.0724565982818604, "logits/rejected": -3.2148144245147705, "logps/chosen": -2.644747018814087, "logps/rejected": -51.407508850097656, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 2.1311590671539307, "rewards/margins": 2.1311590671539307, "rewards/rejected": 0.0, "step": 2661 }, { "epoch": 14.871508379888269, "grad_norm": 0.3438935345685653, "learning_rate": 1.5084045642569764e-08, "logits/chosen": -3.4441304206848145, "logits/rejected": -3.6405422687530518, "logps/chosen": -32.07526779174805, "logps/rejected": -113.20217895507812, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 2.755329132080078, "rewards/margins": 2.755329132080078, "rewards/rejected": 0.0, "step": 2662 }, { "epoch": 14.87709497206704, "grad_norm": 0.35839125398176075, "learning_rate": 1.4935814517677225e-08, "logits/chosen": -3.478083372116089, "logits/rejected": -3.4834911823272705, "logps/chosen": -14.865629196166992, "logps/rejected": -74.30992126464844, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 2.5339725017547607, "rewards/margins": 2.5339725017547607, "rewards/rejected": 0.0, "step": 2663 }, { "epoch": 14.88268156424581, "grad_norm": 0.417511190026462, "learning_rate": 1.4788304284549468e-08, "logits/chosen": -3.2606968879699707, "logits/rejected": -3.337392568588257, "logps/chosen": -0.19713765382766724, "logps/rejected": -74.63046264648438, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": 1.7466249465942383, "rewards/margins": 1.7466249465942383, "rewards/rejected": 0.0, "step": 2664 }, { "epoch": 14.888268156424582, "grad_norm": 0.7624378878488587, "learning_rate": 1.4641515162413021e-08, "logits/chosen": -3.3800857067108154, "logits/rejected": -3.2392077445983887, "logps/chosen": -5.718407154083252, "logps/rejected": -102.63508605957031, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 2.418631076812744, "rewards/margins": 2.418631076812744, "rewards/rejected": 0.0, "step": 2665 }, { "epoch": 14.893854748603353, "grad_norm": 0.3408636206728141, "learning_rate": 1.4495447369422765e-08, "logits/chosen": -3.474879264831543, "logits/rejected": -3.3781349658966064, "logps/chosen": -19.407983779907227, "logps/rejected": -40.66181564331055, "loss": 0.0863, "rewards/accuracies": 1.0, "rewards/chosen": 3.638751983642578, "rewards/margins": 3.638751983642578, "rewards/rejected": 0.0, "step": 2666 }, { "epoch": 14.899441340782122, "grad_norm": 0.3584504928456228, "learning_rate": 1.4350101122661219e-08, "logits/chosen": -3.325920820236206, "logits/rejected": -3.4892213344573975, "logps/chosen": -0.6450057625770569, "logps/rejected": -92.8226547241211, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 2.411738634109497, "rewards/margins": 2.411738634109497, "rewards/rejected": 0.0, "step": 2667 }, { "epoch": 14.905027932960895, "grad_norm": 0.3456695753014961, "learning_rate": 1.4205476638138924e-08, "logits/chosen": -3.2882425785064697, "logits/rejected": -3.3540515899658203, "logps/chosen": -0.23389209806919098, "logps/rejected": -79.29186248779297, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 1.7516508102416992, "rewards/margins": 1.7516508102416992, "rewards/rejected": 0.0, "step": 2668 }, { "epoch": 14.910614525139664, "grad_norm": 0.36433128295571626, "learning_rate": 1.406157413079373e-08, "logits/chosen": -3.3356049060821533, "logits/rejected": -3.394909620285034, "logps/chosen": -7.042104721069336, "logps/rejected": -54.11695098876953, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 3.572568893432617, "rewards/margins": 3.572568893432617, "rewards/rejected": 0.0, "step": 2669 }, { "epoch": 14.916201117318435, "grad_norm": 0.3917565352504246, "learning_rate": 1.391839381449017e-08, "logits/chosen": -3.6086223125457764, "logits/rejected": -3.6328821182250977, "logps/chosen": -17.311904907226562, "logps/rejected": -64.36416625976562, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 3.573361396789551, "rewards/margins": 3.573361396789551, "rewards/rejected": 0.0, "step": 2670 }, { "epoch": 14.921787709497206, "grad_norm": 0.36165601537723724, "learning_rate": 1.3775935902019808e-08, "logits/chosen": -3.27400803565979, "logits/rejected": -3.322840452194214, "logps/chosen": -0.5292971134185791, "logps/rejected": -99.6595230102539, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 2.121829032897949, "rewards/margins": 2.121829032897949, "rewards/rejected": 0.0, "step": 2671 }, { "epoch": 14.927374301675977, "grad_norm": 0.3660794490358241, "learning_rate": 1.3634200605100509e-08, "logits/chosen": -3.366211414337158, "logits/rejected": -3.605614185333252, "logps/chosen": -0.6540247201919556, "logps/rejected": -90.26676940917969, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 2.179960250854492, "rewards/margins": 2.179960250854492, "rewards/rejected": 0.0, "step": 2672 }, { "epoch": 14.932960893854748, "grad_norm": 0.39284028631774637, "learning_rate": 1.349318813437611e-08, "logits/chosen": -3.196241617202759, "logits/rejected": -3.1078386306762695, "logps/chosen": -5.704566955566406, "logps/rejected": -43.353031158447266, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 2.7156381607055664, "rewards/margins": 2.7156381607055664, "rewards/rejected": 0.0, "step": 2673 }, { "epoch": 14.938547486033519, "grad_norm": 1.3109298078540716, "learning_rate": 1.3352898699416248e-08, "logits/chosen": -3.215620517730713, "logits/rejected": -3.4648029804229736, "logps/chosen": -25.826221466064453, "logps/rejected": -37.723182678222656, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 3.5699305534362793, "rewards/margins": 3.5699305534362793, "rewards/rejected": 0.0, "step": 2674 }, { "epoch": 14.94413407821229, "grad_norm": 0.39928406022927004, "learning_rate": 1.321333250871609e-08, "logits/chosen": -3.2574706077575684, "logits/rejected": -3.259279251098633, "logps/chosen": -1.2326209545135498, "logps/rejected": -86.41130828857422, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 1.9776511192321777, "rewards/margins": 1.9776511192321777, "rewards/rejected": 0.0, "step": 2675 }, { "epoch": 14.949720670391061, "grad_norm": 0.4625925721043822, "learning_rate": 1.3074489769695718e-08, "logits/chosen": -3.4715349674224854, "logits/rejected": -3.3700530529022217, "logps/chosen": -3.6988751888275146, "logps/rejected": -30.277475357055664, "loss": 0.1116, "rewards/accuracies": 1.0, "rewards/chosen": 3.283562183380127, "rewards/margins": 3.283562183380127, "rewards/rejected": 0.0, "step": 2676 }, { "epoch": 14.955307262569832, "grad_norm": 0.35754320889886004, "learning_rate": 1.2936370688700238e-08, "logits/chosen": -3.3388655185699463, "logits/rejected": -3.2582459449768066, "logps/chosen": -1.3774136304855347, "logps/rejected": -40.6134033203125, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 3.365615129470825, "rewards/margins": 3.365615129470825, "rewards/rejected": 0.0, "step": 2677 }, { "epoch": 14.960893854748603, "grad_norm": 0.40319193458720853, "learning_rate": 1.2798975470999284e-08, "logits/chosen": -3.1612911224365234, "logits/rejected": -3.018653154373169, "logps/chosen": -0.9814417362213135, "logps/rejected": -51.494178771972656, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.3663196563720703, "rewards/margins": 2.3663196563720703, "rewards/rejected": 0.0, "step": 2678 }, { "epoch": 14.966480446927374, "grad_norm": 0.7016258468644363, "learning_rate": 1.266230432078641e-08, "logits/chosen": -3.057927131652832, "logits/rejected": -3.1106045246124268, "logps/chosen": -1.153242588043213, "logps/rejected": -37.66864013671875, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 2.3504390716552734, "rewards/margins": 2.3504390716552734, "rewards/rejected": 0.0, "step": 2679 }, { "epoch": 14.972067039106145, "grad_norm": 0.38532646113777913, "learning_rate": 1.2526357441179525e-08, "logits/chosen": -3.4418842792510986, "logits/rejected": -3.4901740550994873, "logps/chosen": -3.085214614868164, "logps/rejected": -59.00658416748047, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 2.921659469604492, "rewards/margins": 2.921659469604492, "rewards/rejected": 0.0, "step": 2680 }, { "epoch": 14.972067039106145, "eval_logits/chosen": -3.2502505779266357, "eval_logits/rejected": -3.375441312789917, "eval_logps/chosen": -30.633203506469727, "eval_logps/rejected": -55.960472106933594, "eval_loss": 1.0556432008743286, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.05631597712635994, "eval_rewards/margins": -0.05631597712635994, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7228, "eval_samples_per_second": 9.474, "eval_steps_per_second": 0.306, "step": 2680 }, { "epoch": 14.977653631284916, "grad_norm": 0.4249891450285254, "learning_rate": 1.239113503421968e-08, "logits/chosen": -3.3209052085876465, "logits/rejected": -3.5237913131713867, "logps/chosen": -20.31497573852539, "logps/rejected": -75.81820678710938, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 3.690044403076172, "rewards/margins": 3.690044403076172, "rewards/rejected": 0.0, "step": 2681 }, { "epoch": 14.983240223463687, "grad_norm": 0.9186611554219153, "learning_rate": 1.2256637300871619e-08, "logits/chosen": -3.2406022548675537, "logits/rejected": -3.4094669818878174, "logps/chosen": -0.22653105854988098, "logps/rejected": -88.873779296875, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 1.779331922531128, "rewards/margins": 1.779331922531128, "rewards/rejected": 0.0, "step": 2682 }, { "epoch": 14.988826815642458, "grad_norm": 0.3797020995264553, "learning_rate": 1.2122864441022896e-08, "logits/chosen": -3.150249719619751, "logits/rejected": -3.466625213623047, "logps/chosen": -2.8326287269592285, "logps/rejected": -87.0802001953125, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 2.5402469635009766, "rewards/margins": 2.5402469635009766, "rewards/rejected": 0.0, "step": 2683 }, { "epoch": 14.994413407821229, "grad_norm": 0.5331114849910051, "learning_rate": 1.1989816653483753e-08, "logits/chosen": -3.565075159072876, "logits/rejected": -3.598769426345825, "logps/chosen": -1.9038105010986328, "logps/rejected": -47.03626251220703, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": 3.2623674869537354, "rewards/margins": 3.2623674869537354, "rewards/rejected": 0.0, "step": 2684 }, { "epoch": 15.0, "grad_norm": 0.38551657826205105, "learning_rate": 1.1857494135986857e-08, "logits/chosen": -3.2997381687164307, "logits/rejected": -3.363992214202881, "logps/chosen": -0.6210635900497437, "logps/rejected": -45.32338333129883, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 2.5469043254852295, "rewards/margins": 2.5469043254852295, "rewards/rejected": 0.0, "step": 2685 }, { "epoch": 15.005586592178771, "grad_norm": 0.39172324615560666, "learning_rate": 1.172589708518723e-08, "logits/chosen": -3.440688133239746, "logits/rejected": -3.56842041015625, "logps/chosen": -2.112839698791504, "logps/rejected": -47.4237174987793, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": 2.6917471885681152, "rewards/margins": 2.6917471885681152, "rewards/rejected": 0.0, "step": 2686 }, { "epoch": 15.011173184357542, "grad_norm": 0.4049448507105249, "learning_rate": 1.1595025696661375e-08, "logits/chosen": -3.44490385055542, "logits/rejected": -3.497697353363037, "logps/chosen": -4.48445987701416, "logps/rejected": -65.00106811523438, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 3.183898448944092, "rewards/margins": 3.183898448944092, "rewards/rejected": 0.0, "step": 2687 }, { "epoch": 15.016759776536313, "grad_norm": 0.3432792319540246, "learning_rate": 1.1464880164907376e-08, "logits/chosen": -3.2594828605651855, "logits/rejected": -3.5466315746307373, "logps/chosen": -1.9839377403259277, "logps/rejected": -102.76213836669922, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 2.6706361770629883, "rewards/margins": 2.6706361770629883, "rewards/rejected": 0.0, "step": 2688 }, { "epoch": 15.022346368715084, "grad_norm": 0.4694490269016035, "learning_rate": 1.1335460683344955e-08, "logits/chosen": -3.1902191638946533, "logits/rejected": -3.242811679840088, "logps/chosen": -4.908884048461914, "logps/rejected": -54.772071838378906, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 3.411595582962036, "rewards/margins": 3.411595582962036, "rewards/rejected": 0.0, "step": 2689 }, { "epoch": 15.027932960893855, "grad_norm": 0.4142338154777935, "learning_rate": 1.1206767444314202e-08, "logits/chosen": -3.1747968196868896, "logits/rejected": -3.1738836765289307, "logps/chosen": -4.6701812744140625, "logps/rejected": -58.981422424316406, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 2.1388354301452637, "rewards/margins": 2.1388354301452637, "rewards/rejected": 0.0, "step": 2690 }, { "epoch": 15.033519553072626, "grad_norm": 0.3075111500743282, "learning_rate": 1.1078800639076458e-08, "logits/chosen": -3.4383513927459717, "logits/rejected": -3.5019662380218506, "logps/chosen": -6.686107158660889, "logps/rejected": -43.87648010253906, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": 2.979459285736084, "rewards/margins": 2.979459285736084, "rewards/rejected": 0.0, "step": 2691 }, { "epoch": 15.039106145251397, "grad_norm": 0.4069977774294662, "learning_rate": 1.0951560457813036e-08, "logits/chosen": -3.448096990585327, "logits/rejected": -3.5652003288269043, "logps/chosen": -4.753368854522705, "logps/rejected": -47.4362907409668, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": 3.358572483062744, "rewards/margins": 3.358572483062744, "rewards/rejected": 0.0, "step": 2692 }, { "epoch": 15.044692737430168, "grad_norm": 0.5367508361295136, "learning_rate": 1.0825047089625505e-08, "logits/chosen": -3.3518686294555664, "logits/rejected": -3.2453532218933105, "logps/chosen": -0.5641560554504395, "logps/rejected": -35.182167053222656, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": 1.9657347202301025, "rewards/margins": 1.9657347202301025, "rewards/rejected": 0.0, "step": 2693 }, { "epoch": 15.050279329608939, "grad_norm": 0.7600421614889334, "learning_rate": 1.0699260722535241e-08, "logits/chosen": -3.469454526901245, "logits/rejected": -3.1513831615448, "logps/chosen": -12.951601028442383, "logps/rejected": -59.583045959472656, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 2.727524757385254, "rewards/margins": 2.727524757385254, "rewards/rejected": 0.0, "step": 2694 }, { "epoch": 15.05586592178771, "grad_norm": 0.3705788733523154, "learning_rate": 1.0574201543483375e-08, "logits/chosen": -3.3381845951080322, "logits/rejected": -3.3471267223358154, "logps/chosen": -2.429077386856079, "logps/rejected": -49.2909049987793, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 3.4597177505493164, "rewards/margins": 3.4597177505493164, "rewards/rejected": 0.0, "step": 2695 }, { "epoch": 15.061452513966481, "grad_norm": 0.32981565886700426, "learning_rate": 1.0449869738329952e-08, "logits/chosen": -3.3754477500915527, "logits/rejected": -3.1595022678375244, "logps/chosen": -19.427053451538086, "logps/rejected": -65.10870361328125, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 3.7762484550476074, "rewards/margins": 3.7762484550476074, "rewards/rejected": 0.0, "step": 2696 }, { "epoch": 15.067039106145252, "grad_norm": 0.41375104021716336, "learning_rate": 1.0326265491854113e-08, "logits/chosen": -3.4222311973571777, "logits/rejected": -3.4941632747650146, "logps/chosen": -11.23513412475586, "logps/rejected": -70.00746154785156, "loss": 0.0823, "rewards/accuracies": 1.0, "rewards/chosen": 3.793848752975464, "rewards/margins": 3.793848752975464, "rewards/rejected": 0.0, "step": 2697 }, { "epoch": 15.072625698324023, "grad_norm": 0.336578667935485, "learning_rate": 1.0203388987753969e-08, "logits/chosen": -3.333406925201416, "logits/rejected": -3.3734652996063232, "logps/chosen": -1.4314913749694824, "logps/rejected": -46.64700698852539, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 2.6283531188964844, "rewards/margins": 2.6283531188964844, "rewards/rejected": 0.0, "step": 2698 }, { "epoch": 15.078212290502794, "grad_norm": 0.5739988082155488, "learning_rate": 1.008124040864572e-08, "logits/chosen": -3.5269570350646973, "logits/rejected": -3.4450862407684326, "logps/chosen": -3.9541525840759277, "logps/rejected": -48.523494720458984, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": 3.282543659210205, "rewards/margins": 3.282543659210205, "rewards/rejected": 0.0, "step": 2699 }, { "epoch": 15.083798882681565, "grad_norm": 0.4819194039282161, "learning_rate": 9.95981993606404e-09, "logits/chosen": -3.373324394226074, "logits/rejected": -3.571223258972168, "logps/chosen": -0.398854523897171, "logps/rejected": -76.59867095947266, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 2.2527942657470703, "rewards/margins": 2.2527942657470703, "rewards/rejected": 0.0, "step": 2700 }, { "epoch": 15.083798882681565, "eval_logits/chosen": -3.2535011768341064, "eval_logits/rejected": -3.377370834350586, "eval_logps/chosen": -30.89455223083496, "eval_logps/rejected": -56.2887077331543, "eval_loss": 1.0546748638153076, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.08245096355676651, "eval_rewards/margins": -0.08245096355676651, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6911, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.306, "step": 2700 }, { "epoch": 15.089385474860336, "grad_norm": 0.5165000292690357, "learning_rate": 9.839127750461307e-09, "logits/chosen": -3.2869677543640137, "logits/rejected": -3.3428895473480225, "logps/chosen": -12.017826080322266, "logps/rejected": -70.00727844238281, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": 2.796191453933716, "rewards/margins": 2.796191453933716, "rewards/rejected": 0.0, "step": 2701 }, { "epoch": 15.094972067039107, "grad_norm": 0.9315881207818567, "learning_rate": 9.719164031207594e-09, "logits/chosen": -3.3606231212615967, "logits/rejected": -3.4705748558044434, "logps/chosen": -4.759427547454834, "logps/rejected": -79.03294372558594, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 3.3535492420196533, "rewards/margins": 3.3535492420196533, "rewards/rejected": 0.0, "step": 2702 }, { "epoch": 15.100558659217878, "grad_norm": 0.3500163013017413, "learning_rate": 9.599928956590341e-09, "logits/chosen": -3.4071614742279053, "logits/rejected": -3.410222291946411, "logps/chosen": -0.421855628490448, "logps/rejected": -62.42582702636719, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 2.2876079082489014, "rewards/margins": 2.2876079082489014, "rewards/rejected": 0.0, "step": 2703 }, { "epoch": 15.106145251396647, "grad_norm": 0.3711339402041741, "learning_rate": 9.48142270381419e-09, "logits/chosen": -3.186708688735962, "logits/rejected": -3.0210330486297607, "logps/chosen": -0.47165483236312866, "logps/rejected": -112.34162139892578, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 2.1767287254333496, "rewards/margins": 2.1767287254333496, "rewards/rejected": 0.0, "step": 2704 }, { "epoch": 15.111731843575418, "grad_norm": 0.3545778721974017, "learning_rate": 9.363645449000424e-09, "logits/chosen": -3.1981797218322754, "logits/rejected": -3.3513071537017822, "logps/chosen": -2.14025616645813, "logps/rejected": -47.29647445678711, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 3.368718385696411, "rewards/margins": 3.368718385696411, "rewards/rejected": 0.0, "step": 2705 }, { "epoch": 15.11731843575419, "grad_norm": 0.36042445074666246, "learning_rate": 9.246597367187137e-09, "logits/chosen": -3.6300017833709717, "logits/rejected": -3.638338088989258, "logps/chosen": -0.6851445436477661, "logps/rejected": -44.4206657409668, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 2.4193100929260254, "rewards/margins": 2.4193100929260254, "rewards/rejected": 0.0, "step": 2706 }, { "epoch": 15.12290502793296, "grad_norm": 0.35750649403745627, "learning_rate": 9.13027863232857e-09, "logits/chosen": -3.5256896018981934, "logits/rejected": -3.6527068614959717, "logps/chosen": -0.9782887697219849, "logps/rejected": -79.44715118408203, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 2.6410274505615234, "rewards/margins": 2.6410274505615234, "rewards/rejected": 0.0, "step": 2707 }, { "epoch": 15.128491620111731, "grad_norm": 0.3815683215151135, "learning_rate": 9.014689417294997e-09, "logits/chosen": -3.341118335723877, "logits/rejected": -3.4082393646240234, "logps/chosen": -4.818462371826172, "logps/rejected": -28.40052032470703, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 3.1663920879364014, "rewards/margins": 3.1663920879364014, "rewards/rejected": 0.0, "step": 2708 }, { "epoch": 15.134078212290502, "grad_norm": 0.32520778180684184, "learning_rate": 8.899829893872723e-09, "logits/chosen": -3.397822141647339, "logits/rejected": -3.574009418487549, "logps/chosen": -0.23265886306762695, "logps/rejected": -67.71945190429688, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": 1.8376045227050781, "rewards/margins": 1.8376045227050781, "rewards/rejected": 0.0, "step": 2709 }, { "epoch": 15.139664804469273, "grad_norm": 0.4223280329092187, "learning_rate": 8.785700232763316e-09, "logits/chosen": -3.484344959259033, "logits/rejected": -3.507127046585083, "logps/chosen": -7.165587902069092, "logps/rejected": -36.34278106689453, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": 2.9120960235595703, "rewards/margins": 2.9120960235595703, "rewards/rejected": 0.0, "step": 2710 }, { "epoch": 15.145251396648044, "grad_norm": 0.35484509768665096, "learning_rate": 8.672300603583927e-09, "logits/chosen": -3.5402567386627197, "logits/rejected": -3.580690860748291, "logps/chosen": -0.25957396626472473, "logps/rejected": -25.984642028808594, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 1.6842262744903564, "rewards/margins": 1.6842262744903564, "rewards/rejected": 0.0, "step": 2711 }, { "epoch": 15.150837988826815, "grad_norm": 0.5961823887947978, "learning_rate": 8.559631174866466e-09, "logits/chosen": -3.2367472648620605, "logits/rejected": -3.441416025161743, "logps/chosen": -0.6959924697875977, "logps/rejected": -58.732383728027344, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 2.281392812728882, "rewards/margins": 2.281392812728882, "rewards/rejected": 0.0, "step": 2712 }, { "epoch": 15.156424581005586, "grad_norm": 0.3467801645805499, "learning_rate": 8.447692114057825e-09, "logits/chosen": -3.3672757148742676, "logits/rejected": -3.531606912612915, "logps/chosen": -4.9795637130737305, "logps/rejected": -62.002288818359375, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.696640729904175, "rewards/margins": 2.696640729904175, "rewards/rejected": 0.0, "step": 2713 }, { "epoch": 15.162011173184357, "grad_norm": 0.4039314376388327, "learning_rate": 8.336483587519317e-09, "logits/chosen": -3.610227346420288, "logits/rejected": -3.4002838134765625, "logps/chosen": -0.809124231338501, "logps/rejected": -57.50293731689453, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 2.6413872241973877, "rewards/margins": 2.6413872241973877, "rewards/rejected": 0.0, "step": 2714 }, { "epoch": 15.167597765363128, "grad_norm": 0.40278736650436614, "learning_rate": 8.226005760526788e-09, "logits/chosen": -3.2914633750915527, "logits/rejected": -3.4402213096618652, "logps/chosen": -1.4635589122772217, "logps/rejected": -75.35786437988281, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 2.584859609603882, "rewards/margins": 2.584859609603882, "rewards/rejected": 0.0, "step": 2715 }, { "epoch": 15.1731843575419, "grad_norm": 0.45085494048910235, "learning_rate": 8.116258797269792e-09, "logits/chosen": -3.327390432357788, "logits/rejected": -3.331871271133423, "logps/chosen": -5.626540184020996, "logps/rejected": -59.60590362548828, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 2.6810355186462402, "rewards/margins": 2.6810355186462402, "rewards/rejected": 0.0, "step": 2716 }, { "epoch": 15.17877094972067, "grad_norm": 0.37558104584062896, "learning_rate": 8.00724286085197e-09, "logits/chosen": -3.3111743927001953, "logits/rejected": -3.431515693664551, "logps/chosen": -0.6694455742835999, "logps/rejected": -63.245025634765625, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 2.226891279220581, "rewards/margins": 2.226891279220581, "rewards/rejected": 0.0, "step": 2717 }, { "epoch": 15.184357541899441, "grad_norm": 0.38529163976970043, "learning_rate": 7.898958113290389e-09, "logits/chosen": -3.3776395320892334, "logits/rejected": -3.3438429832458496, "logps/chosen": -1.2175061702728271, "logps/rejected": -47.03485870361328, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 2.5619194507598877, "rewards/margins": 2.5619194507598877, "rewards/rejected": 0.0, "step": 2718 }, { "epoch": 15.189944134078212, "grad_norm": 0.4038537049443026, "learning_rate": 7.79140471551537e-09, "logits/chosen": -3.1527297496795654, "logits/rejected": -2.9282472133636475, "logps/chosen": -3.5159504413604736, "logps/rejected": -117.24052429199219, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 2.7162249088287354, "rewards/margins": 2.7162249088287354, "rewards/rejected": 0.0, "step": 2719 }, { "epoch": 15.195530726256983, "grad_norm": 0.743766690694123, "learning_rate": 7.684582827370556e-09, "logits/chosen": -3.243110179901123, "logits/rejected": -3.5146403312683105, "logps/chosen": -6.167523384094238, "logps/rejected": -46.33700180053711, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 2.9911394119262695, "rewards/margins": 2.9911394119262695, "rewards/rejected": 0.0, "step": 2720 }, { "epoch": 15.195530726256983, "eval_logits/chosen": -3.2534797191619873, "eval_logits/rejected": -3.377426862716675, "eval_logps/chosen": -30.66217613220215, "eval_logps/rejected": -55.8941764831543, "eval_loss": 1.0542831420898438, "eval_rewards/accuracies": 0.574999988079071, "eval_rewards/chosen": -0.05921328067779541, "eval_rewards/margins": -0.05921328067779541, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7089, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.306, "step": 2720 }, { "epoch": 15.201117318435754, "grad_norm": 0.3434447764060888, "learning_rate": 7.57849260761223e-09, "logits/chosen": -3.464975595474243, "logits/rejected": -3.4877188205718994, "logps/chosen": -0.31153929233551025, "logps/rejected": -44.04620361328125, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 2.0053279399871826, "rewards/margins": 2.0053279399871826, "rewards/rejected": 0.0, "step": 2721 }, { "epoch": 15.206703910614525, "grad_norm": 0.46505473654441737, "learning_rate": 7.473134213909326e-09, "logits/chosen": -3.2479467391967773, "logits/rejected": -3.1812796592712402, "logps/chosen": -10.523893356323242, "logps/rejected": -47.685768127441406, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 3.1481878757476807, "rewards/margins": 3.1481878757476807, "rewards/rejected": 0.0, "step": 2722 }, { "epoch": 15.212290502793296, "grad_norm": 0.3462974578647901, "learning_rate": 7.368507802843205e-09, "logits/chosen": -2.9624855518341064, "logits/rejected": -3.018097400665283, "logps/chosen": -8.608769416809082, "logps/rejected": -70.04875183105469, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 2.850444793701172, "rewards/margins": 2.850444793701172, "rewards/rejected": 0.0, "step": 2723 }, { "epoch": 15.217877094972067, "grad_norm": 0.482276154279113, "learning_rate": 7.264613529907482e-09, "logits/chosen": -3.4359025955200195, "logits/rejected": -3.4081170558929443, "logps/chosen": -6.527714729309082, "logps/rejected": -41.29267501831055, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": 3.407116651535034, "rewards/margins": 3.407116651535034, "rewards/rejected": 0.0, "step": 2724 }, { "epoch": 15.223463687150838, "grad_norm": 0.4081627577822981, "learning_rate": 7.1614515495073134e-09, "logits/chosen": -3.299471616744995, "logits/rejected": -3.2404634952545166, "logps/chosen": -0.8103320598602295, "logps/rejected": -84.19842529296875, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 2.342984199523926, "rewards/margins": 2.342984199523926, "rewards/rejected": 0.0, "step": 2725 }, { "epoch": 15.22905027932961, "grad_norm": 0.5901699901597808, "learning_rate": 7.059022014959892e-09, "logits/chosen": -3.3895773887634277, "logits/rejected": -3.554718494415283, "logps/chosen": -6.208129405975342, "logps/rejected": -36.23834228515625, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 4.03847599029541, "rewards/margins": 4.03847599029541, "rewards/rejected": 0.0, "step": 2726 }, { "epoch": 15.23463687150838, "grad_norm": 0.3070817502639455, "learning_rate": 6.957325078493836e-09, "logits/chosen": -3.4733989238739014, "logits/rejected": -3.4898452758789062, "logps/chosen": -4.381089210510254, "logps/rejected": -35.10344314575195, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 2.7187013626098633, "rewards/margins": 2.7187013626098633, "rewards/rejected": 0.0, "step": 2727 }, { "epoch": 15.240223463687151, "grad_norm": 0.32389759578194693, "learning_rate": 6.856360891248692e-09, "logits/chosen": -3.2947194576263428, "logits/rejected": -3.4516637325286865, "logps/chosen": -4.336068630218506, "logps/rejected": -58.00398635864258, "loss": 0.1186, "rewards/accuracies": 1.0, "rewards/chosen": 3.1628036499023438, "rewards/margins": 3.1628036499023438, "rewards/rejected": 0.0, "step": 2728 }, { "epoch": 15.245810055865922, "grad_norm": 0.3574632109553235, "learning_rate": 6.756129603275373e-09, "logits/chosen": -3.4045233726501465, "logits/rejected": -3.68743896484375, "logps/chosen": -4.604673862457275, "logps/rejected": -64.30682373046875, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 3.2407960891723633, "rewards/margins": 3.2407960891723633, "rewards/rejected": 0.0, "step": 2729 }, { "epoch": 15.251396648044693, "grad_norm": 0.3472052756323566, "learning_rate": 6.656631363535448e-09, "logits/chosen": -3.3194024562835693, "logits/rejected": -3.561497211456299, "logps/chosen": -29.88372039794922, "logps/rejected": -46.24955368041992, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 3.303365707397461, "rewards/margins": 3.303365707397461, "rewards/rejected": 0.0, "step": 2730 }, { "epoch": 15.256983240223464, "grad_norm": 0.39174063896126554, "learning_rate": 6.557866319900906e-09, "logits/chosen": -3.5682806968688965, "logits/rejected": -3.375464916229248, "logps/chosen": -21.68202018737793, "logps/rejected": -39.89384460449219, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 3.635549545288086, "rewards/margins": 3.635549545288086, "rewards/rejected": 0.0, "step": 2731 }, { "epoch": 15.262569832402235, "grad_norm": 0.35820702823310646, "learning_rate": 6.459834619154281e-09, "logits/chosen": -3.4970037937164307, "logits/rejected": -3.3811521530151367, "logps/chosen": -6.336273193359375, "logps/rejected": -83.13807678222656, "loss": 0.1113, "rewards/accuracies": 1.0, "rewards/chosen": 2.971449375152588, "rewards/margins": 2.971449375152588, "rewards/rejected": 0.0, "step": 2732 }, { "epoch": 15.268156424581006, "grad_norm": 0.41517351887392095, "learning_rate": 6.362536406988084e-09, "logits/chosen": -3.433579683303833, "logits/rejected": -3.5727155208587646, "logps/chosen": -6.247230529785156, "logps/rejected": -26.58795928955078, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 3.5267505645751953, "rewards/margins": 3.5267505645751953, "rewards/rejected": 0.0, "step": 2733 }, { "epoch": 15.273743016759777, "grad_norm": 0.43341762233507536, "learning_rate": 6.265971828004812e-09, "logits/chosen": -3.1191630363464355, "logits/rejected": -3.0236997604370117, "logps/chosen": -22.686269760131836, "logps/rejected": -32.200618743896484, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 3.7814812660217285, "rewards/margins": 3.7814812660217285, "rewards/rejected": 0.0, "step": 2734 }, { "epoch": 15.279329608938548, "grad_norm": 0.5873944055075848, "learning_rate": 6.170141025716668e-09, "logits/chosen": -3.465365409851074, "logits/rejected": -3.3521463871002197, "logps/chosen": -0.907586932182312, "logps/rejected": -32.12206268310547, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.4782509803771973, "rewards/margins": 2.4782509803771973, "rewards/rejected": 0.0, "step": 2735 }, { "epoch": 15.28491620111732, "grad_norm": 0.44320503233625297, "learning_rate": 6.075044142545172e-09, "logits/chosen": -3.2940800189971924, "logits/rejected": -3.42421555519104, "logps/chosen": -2.1653804779052734, "logps/rejected": -38.06364822387695, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 3.0755441188812256, "rewards/margins": 3.0755441188812256, "rewards/rejected": 0.0, "step": 2736 }, { "epoch": 15.29050279329609, "grad_norm": 0.48844686058675907, "learning_rate": 5.980681319821379e-09, "logits/chosen": -3.2081522941589355, "logits/rejected": -3.2111096382141113, "logps/chosen": -0.3553764820098877, "logps/rejected": -47.57817077636719, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 2.0247111320495605, "rewards/margins": 2.0247111320495605, "rewards/rejected": 0.0, "step": 2737 }, { "epoch": 15.296089385474861, "grad_norm": 0.40511396835768404, "learning_rate": 5.887052697785166e-09, "logits/chosen": -3.509035110473633, "logits/rejected": -3.4871180057525635, "logps/chosen": -1.976684808731079, "logps/rejected": -31.386011123657227, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 2.5718722343444824, "rewards/margins": 2.5718722343444824, "rewards/rejected": 0.0, "step": 2738 }, { "epoch": 15.30167597765363, "grad_norm": 0.31613930392954237, "learning_rate": 5.794158415585282e-09, "logits/chosen": -3.2557921409606934, "logits/rejected": -3.3748040199279785, "logps/chosen": -8.889276504516602, "logps/rejected": -51.01299285888672, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 3.503488540649414, "rewards/margins": 3.503488540649414, "rewards/rejected": 0.0, "step": 2739 }, { "epoch": 15.307262569832401, "grad_norm": 0.35332353763257246, "learning_rate": 5.701998611279291e-09, "logits/chosen": -3.317143201828003, "logits/rejected": -3.417137861251831, "logps/chosen": -7.209802150726318, "logps/rejected": -53.06700897216797, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 2.875089645385742, "rewards/margins": 2.875089645385742, "rewards/rejected": 0.0, "step": 2740 }, { "epoch": 15.307262569832401, "eval_logits/chosen": -3.248673677444458, "eval_logits/rejected": -3.3741021156311035, "eval_logps/chosen": -30.63228988647461, "eval_logps/rejected": -55.99335479736328, "eval_loss": 1.0509639978408813, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.05622493475675583, "eval_rewards/margins": -0.05622493475675583, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7092, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 2740 }, { "epoch": 15.312849162011172, "grad_norm": 0.3515655917271136, "learning_rate": 5.610573421832965e-09, "logits/chosen": -3.2977492809295654, "logits/rejected": -3.5348691940307617, "logps/chosen": -2.017648935317993, "logps/rejected": -41.454952239990234, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": 3.260664939880371, "rewards/margins": 3.260664939880371, "rewards/rejected": 0.0, "step": 2741 }, { "epoch": 15.318435754189943, "grad_norm": 0.37468446332625743, "learning_rate": 5.5198829831205606e-09, "logits/chosen": -3.413870096206665, "logits/rejected": -3.137200355529785, "logps/chosen": -0.5963768362998962, "logps/rejected": -49.070533752441406, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": 2.7700161933898926, "rewards/margins": 2.7700161933898926, "rewards/rejected": 0.0, "step": 2742 }, { "epoch": 15.324022346368714, "grad_norm": 0.35422052364738515, "learning_rate": 5.429927429924042e-09, "logits/chosen": -3.496981143951416, "logits/rejected": -3.2775912284851074, "logps/chosen": -6.259299278259277, "logps/rejected": -80.9996337890625, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": 3.123628616333008, "rewards/margins": 3.123628616333008, "rewards/rejected": 0.0, "step": 2743 }, { "epoch": 15.329608938547485, "grad_norm": 0.48379818395480784, "learning_rate": 5.340706895933577e-09, "logits/chosen": -3.5097808837890625, "logits/rejected": -3.4879391193389893, "logps/chosen": -7.59335994720459, "logps/rejected": -28.682659149169922, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 3.4279065132141113, "rewards/margins": 3.4279065132141113, "rewards/rejected": 0.0, "step": 2744 }, { "epoch": 15.335195530726256, "grad_norm": 0.4690569174309656, "learning_rate": 5.2522215137465465e-09, "logits/chosen": -3.30172061920166, "logits/rejected": -3.314605712890625, "logps/chosen": -0.37239521741867065, "logps/rejected": -46.06013488769531, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 1.7459301948547363, "rewards/margins": 1.7459301948547363, "rewards/rejected": 0.0, "step": 2745 }, { "epoch": 15.340782122905027, "grad_norm": 0.36687165605599187, "learning_rate": 5.164471414868199e-09, "logits/chosen": -3.320753335952759, "logits/rejected": -3.253136157989502, "logps/chosen": -1.0493203401565552, "logps/rejected": -22.281099319458008, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 2.4916725158691406, "rewards/margins": 2.4916725158691406, "rewards/rejected": 0.0, "step": 2746 }, { "epoch": 15.346368715083798, "grad_norm": 0.3325703607712872, "learning_rate": 5.0774567297106606e-09, "logits/chosen": -3.440880298614502, "logits/rejected": -3.3833158016204834, "logps/chosen": -4.225191116333008, "logps/rejected": -61.686500549316406, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 3.345611095428467, "rewards/margins": 3.345611095428467, "rewards/rejected": 0.0, "step": 2747 }, { "epoch": 15.35195530726257, "grad_norm": 0.35702848776093915, "learning_rate": 4.99117758759332e-09, "logits/chosen": -3.266876220703125, "logits/rejected": -3.3380188941955566, "logps/chosen": -0.42809247970581055, "logps/rejected": -62.52800750732422, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 2.2515180110931396, "rewards/margins": 2.2515180110931396, "rewards/rejected": 0.0, "step": 2748 }, { "epoch": 15.35754189944134, "grad_norm": 0.41493908331632684, "learning_rate": 4.905634116742386e-09, "logits/chosen": -3.363133430480957, "logits/rejected": -3.332078695297241, "logps/chosen": -1.516545295715332, "logps/rejected": -64.26512145996094, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.72746205329895, "rewards/margins": 2.72746205329895, "rewards/rejected": 0.0, "step": 2749 }, { "epoch": 15.363128491620111, "grad_norm": 0.3838088181751727, "learning_rate": 4.820826444290604e-09, "logits/chosen": -3.287088394165039, "logits/rejected": -3.380878210067749, "logps/chosen": -6.814674377441406, "logps/rejected": -46.192665100097656, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 3.646423816680908, "rewards/margins": 3.646423816680908, "rewards/rejected": 0.0, "step": 2750 }, { "epoch": 15.368715083798882, "grad_norm": 0.37050209695463104, "learning_rate": 4.736754696277268e-09, "logits/chosen": -3.5849552154541016, "logits/rejected": -3.4903130531311035, "logps/chosen": -34.72379684448242, "logps/rejected": -35.2935791015625, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 4.601708889007568, "rewards/margins": 4.601708889007568, "rewards/rejected": 0.0, "step": 2751 }, { "epoch": 15.374301675977653, "grad_norm": 0.3436745519374388, "learning_rate": 4.65341899764804e-09, "logits/chosen": -3.405404806137085, "logits/rejected": -3.4282948970794678, "logps/chosen": -0.6028783321380615, "logps/rejected": -52.68398666381836, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 2.1569461822509766, "rewards/margins": 2.1569461822509766, "rewards/rejected": 0.0, "step": 2752 }, { "epoch": 15.379888268156424, "grad_norm": 0.4644076721999623, "learning_rate": 4.570819472254628e-09, "logits/chosen": -3.4488673210144043, "logits/rejected": -3.379628896713257, "logps/chosen": -0.7595265507698059, "logps/rejected": -85.30735778808594, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 2.232293128967285, "rewards/margins": 2.232293128967285, "rewards/rejected": 0.0, "step": 2753 }, { "epoch": 15.385474860335195, "grad_norm": 0.34475775741966763, "learning_rate": 4.488956242854503e-09, "logits/chosen": -3.2192726135253906, "logits/rejected": -3.2809946537017822, "logps/chosen": -1.3715202808380127, "logps/rejected": -53.71438980102539, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 2.748518943786621, "rewards/margins": 2.748518943786621, "rewards/rejected": 0.0, "step": 2754 }, { "epoch": 15.391061452513966, "grad_norm": 0.3098299269784389, "learning_rate": 4.407829431111176e-09, "logits/chosen": -3.5013234615325928, "logits/rejected": -3.4125614166259766, "logps/chosen": -0.7996332049369812, "logps/rejected": -140.3853759765625, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 2.476461410522461, "rewards/margins": 2.476461410522461, "rewards/rejected": 0.0, "step": 2755 }, { "epoch": 15.396648044692737, "grad_norm": 0.4045607273273071, "learning_rate": 4.3274391575933714e-09, "logits/chosen": -3.5030741691589355, "logits/rejected": -3.4345967769622803, "logps/chosen": -0.5863586664199829, "logps/rejected": -41.62118148803711, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 2.4675443172454834, "rewards/margins": 2.4675443172454834, "rewards/rejected": 0.0, "step": 2756 }, { "epoch": 15.402234636871508, "grad_norm": 0.37688208971115295, "learning_rate": 4.247785541775517e-09, "logits/chosen": -3.214529275894165, "logits/rejected": -3.252412796020508, "logps/chosen": -8.346531867980957, "logps/rejected": -38.34511184692383, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 3.976717472076416, "rewards/margins": 3.976717472076416, "rewards/rejected": 0.0, "step": 2757 }, { "epoch": 15.40782122905028, "grad_norm": 0.3723122284990119, "learning_rate": 4.168868702036921e-09, "logits/chosen": -3.5542376041412354, "logits/rejected": -3.650407314300537, "logps/chosen": -0.21057796478271484, "logps/rejected": -90.39620208740234, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 1.6911938190460205, "rewards/margins": 1.6911938190460205, "rewards/rejected": 0.0, "step": 2758 }, { "epoch": 15.41340782122905, "grad_norm": 0.3624657042072919, "learning_rate": 4.090688755662153e-09, "logits/chosen": -3.197291612625122, "logits/rejected": -2.999779224395752, "logps/chosen": -0.9325481653213501, "logps/rejected": -71.47624969482422, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 2.152543544769287, "rewards/margins": 2.152543544769287, "rewards/rejected": 0.0, "step": 2759 }, { "epoch": 15.418994413407821, "grad_norm": 0.3472619525538552, "learning_rate": 4.0132458188405496e-09, "logits/chosen": -3.5566887855529785, "logits/rejected": -3.4715640544891357, "logps/chosen": -8.182580947875977, "logps/rejected": -33.7274169921875, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 3.491220712661743, "rewards/margins": 3.491220712661743, "rewards/rejected": 0.0, "step": 2760 }, { "epoch": 15.418994413407821, "eval_logits/chosen": -3.256040573120117, "eval_logits/rejected": -3.3804473876953125, "eval_logps/chosen": -30.82790756225586, "eval_logps/rejected": -56.25598907470703, "eval_loss": 1.0565524101257324, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.0757867619395256, "eval_rewards/margins": -0.0757867619395256, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6514, "eval_samples_per_second": 9.494, "eval_steps_per_second": 0.306, "step": 2760 }, { "epoch": 15.424581005586592, "grad_norm": 0.36330710276209954, "learning_rate": 3.936540006665989e-09, "logits/chosen": -3.451162815093994, "logits/rejected": -3.5979788303375244, "logps/chosen": -3.8159427642822266, "logps/rejected": -34.103302001953125, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 3.461691379547119, "rewards/margins": 3.461691379547119, "rewards/rejected": 0.0, "step": 2761 }, { "epoch": 15.430167597765363, "grad_norm": 0.3461085529795291, "learning_rate": 3.860571433137116e-09, "logits/chosen": -3.5518178939819336, "logits/rejected": -3.4665303230285645, "logps/chosen": -2.5919435024261475, "logps/rejected": -103.93636322021484, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 3.152487277984619, "rewards/margins": 3.152487277984619, "rewards/rejected": 0.0, "step": 2762 }, { "epoch": 15.435754189944134, "grad_norm": 0.3751277054303707, "learning_rate": 3.785340211156618e-09, "logits/chosen": -3.345266580581665, "logits/rejected": -3.226581335067749, "logps/chosen": -14.586767196655273, "logps/rejected": -28.173521041870117, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 3.4477198123931885, "rewards/margins": 3.4477198123931885, "rewards/rejected": 0.0, "step": 2763 }, { "epoch": 15.441340782122905, "grad_norm": 0.417944946479477, "learning_rate": 3.7108464525316685e-09, "logits/chosen": -3.285778045654297, "logits/rejected": -3.213641405105591, "logps/chosen": -6.269316673278809, "logps/rejected": -47.36651611328125, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 3.783946990966797, "rewards/margins": 3.783946990966797, "rewards/rejected": 0.0, "step": 2764 }, { "epoch": 15.446927374301676, "grad_norm": 0.37137562846821254, "learning_rate": 3.637090267973042e-09, "logits/chosen": -3.311185598373413, "logits/rejected": -3.404200315475464, "logps/chosen": -13.746042251586914, "logps/rejected": -54.992279052734375, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": 2.807616710662842, "rewards/margins": 2.807616710662842, "rewards/rejected": 0.0, "step": 2765 }, { "epoch": 15.452513966480447, "grad_norm": 0.38958315114479514, "learning_rate": 3.564071767095722e-09, "logits/chosen": -3.327028274536133, "logits/rejected": -3.3071060180664062, "logps/chosen": -0.4666496515274048, "logps/rejected": -85.70694732666016, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 1.8747763633728027, "rewards/margins": 1.8747763633728027, "rewards/rejected": 0.0, "step": 2766 }, { "epoch": 15.458100558659218, "grad_norm": 0.40104745850233675, "learning_rate": 3.491791058418181e-09, "logits/chosen": -3.304408073425293, "logits/rejected": -3.363276720046997, "logps/chosen": -0.6727848052978516, "logps/rejected": -38.689842224121094, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 2.456422805786133, "rewards/margins": 2.456422805786133, "rewards/rejected": 0.0, "step": 2767 }, { "epoch": 15.46368715083799, "grad_norm": 0.4313312495782813, "learning_rate": 3.4202482493623785e-09, "logits/chosen": -3.247502088546753, "logits/rejected": -3.3012685775756836, "logps/chosen": -1.0190379619598389, "logps/rejected": -45.1026611328125, "loss": 0.0934, "rewards/accuracies": 1.0, "rewards/chosen": 2.6603927612304688, "rewards/margins": 2.6603927612304688, "rewards/rejected": 0.0, "step": 2768 }, { "epoch": 15.46927374301676, "grad_norm": 0.3697268473542088, "learning_rate": 3.349443446253708e-09, "logits/chosen": -3.3885860443115234, "logits/rejected": -3.260544776916504, "logps/chosen": -5.293089866638184, "logps/rejected": -54.82860565185547, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": 3.2731010913848877, "rewards/margins": 3.2731010913848877, "rewards/rejected": 0.0, "step": 2769 }, { "epoch": 15.474860335195531, "grad_norm": 0.3749107031852132, "learning_rate": 3.2793767543207172e-09, "logits/chosen": -3.2207043170928955, "logits/rejected": -3.2187039852142334, "logps/chosen": -13.834918975830078, "logps/rejected": -55.70347595214844, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": 3.689237117767334, "rewards/margins": 3.689237117767334, "rewards/rejected": 0.0, "step": 2770 }, { "epoch": 15.480446927374302, "grad_norm": 0.35471904615786953, "learning_rate": 3.2100482776949432e-09, "logits/chosen": -3.1515934467315674, "logits/rejected": -3.304704427719116, "logps/chosen": -0.6870374083518982, "logps/rejected": -86.122314453125, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 2.0924084186553955, "rewards/margins": 2.0924084186553955, "rewards/rejected": 0.0, "step": 2771 }, { "epoch": 15.486033519553073, "grad_norm": 0.44887963252520147, "learning_rate": 3.141458119410856e-09, "logits/chosen": -3.3256747722625732, "logits/rejected": -3.313903331756592, "logps/chosen": -2.753474473953247, "logps/rejected": -36.37815475463867, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 2.6246652603149414, "rewards/margins": 2.6246652603149414, "rewards/rejected": 0.0, "step": 2772 }, { "epoch": 15.491620111731844, "grad_norm": 0.4019342901537875, "learning_rate": 3.073606381405747e-09, "logits/chosen": -3.468522071838379, "logits/rejected": -3.572396993637085, "logps/chosen": -2.670816421508789, "logps/rejected": -37.25193405151367, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 3.2995452880859375, "rewards/margins": 3.2995452880859375, "rewards/rejected": 0.0, "step": 2773 }, { "epoch": 15.497206703910614, "grad_norm": 0.4202327403144555, "learning_rate": 3.0064931645192306e-09, "logits/chosen": -3.538756847381592, "logits/rejected": -3.5643651485443115, "logps/chosen": -3.070742607116699, "logps/rejected": -66.23321533203125, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 2.683175563812256, "rewards/margins": 2.683175563812256, "rewards/rejected": 0.0, "step": 2774 }, { "epoch": 15.502793296089386, "grad_norm": 0.5020722710923473, "learning_rate": 2.940118568493577e-09, "logits/chosen": -3.5868358612060547, "logits/rejected": -3.4845035076141357, "logps/chosen": -4.515740394592285, "logps/rejected": -62.495758056640625, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.8176839351654053, "rewards/margins": 2.8176839351654053, "rewards/rejected": 0.0, "step": 2775 }, { "epoch": 15.508379888268156, "grad_norm": 0.4227194900443271, "learning_rate": 2.8744826919733233e-09, "logits/chosen": -3.276412010192871, "logits/rejected": -3.315068006515503, "logps/chosen": -2.1350624561309814, "logps/rejected": -52.504066467285156, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 2.8215818405151367, "rewards/margins": 2.8215818405151367, "rewards/rejected": 0.0, "step": 2776 }, { "epoch": 15.513966480446927, "grad_norm": 0.41264950439068643, "learning_rate": 2.8095856325049405e-09, "logits/chosen": -3.0627875328063965, "logits/rejected": -3.2191262245178223, "logps/chosen": -0.2128685712814331, "logps/rejected": -96.5482177734375, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 1.4802885055541992, "rewards/margins": 1.4802885055541992, "rewards/rejected": 0.0, "step": 2777 }, { "epoch": 15.519553072625698, "grad_norm": 0.33991140388893726, "learning_rate": 2.745427486537111e-09, "logits/chosen": -3.030118942260742, "logits/rejected": -3.1139426231384277, "logps/chosen": -2.5844459533691406, "logps/rejected": -53.24211120605469, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 2.950360059738159, "rewards/margins": 2.950360059738159, "rewards/rejected": 0.0, "step": 2778 }, { "epoch": 15.525139664804469, "grad_norm": 0.33174144887954643, "learning_rate": 2.6820083494201727e-09, "logits/chosen": -3.4181268215179443, "logits/rejected": -3.5480806827545166, "logps/chosen": -0.7553225755691528, "logps/rejected": -82.75349426269531, "loss": 0.1136, "rewards/accuracies": 1.0, "rewards/chosen": 2.4046425819396973, "rewards/margins": 2.4046425819396973, "rewards/rejected": 0.0, "step": 2779 }, { "epoch": 15.53072625698324, "grad_norm": 0.36692690904686837, "learning_rate": 2.6193283154063996e-09, "logits/chosen": -3.325366735458374, "logits/rejected": -3.6006627082824707, "logps/chosen": -1.3810858726501465, "logps/rejected": -67.34014892578125, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 2.4140939712524414, "rewards/margins": 2.4140939712524414, "rewards/rejected": 0.0, "step": 2780 }, { "epoch": 15.53072625698324, "eval_logits/chosen": -3.257312297821045, "eval_logits/rejected": -3.3804821968078613, "eval_logps/chosen": -30.872472763061523, "eval_logps/rejected": -56.256752014160156, "eval_loss": 1.0601108074188232, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.08024321496486664, "eval_rewards/margins": -0.08024321496486664, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6862, "eval_samples_per_second": 9.484, "eval_steps_per_second": 0.306, "step": 2780 }, { "epoch": 15.53631284916201, "grad_norm": 0.4061069101464907, "learning_rate": 2.557387477649331e-09, "logits/chosen": -3.199857234954834, "logits/rejected": -3.4232935905456543, "logps/chosen": -4.499742031097412, "logps/rejected": -53.95488739013672, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": 3.0268375873565674, "rewards/margins": 3.0268375873565674, "rewards/rejected": 0.0, "step": 2781 }, { "epoch": 15.541899441340782, "grad_norm": 0.40446462715684434, "learning_rate": 2.496185928204164e-09, "logits/chosen": -3.1261484622955322, "logits/rejected": -3.2383618354797363, "logps/chosen": -1.6808207035064697, "logps/rejected": -50.73039245605469, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 2.562302589416504, "rewards/margins": 2.562302589416504, "rewards/rejected": 0.0, "step": 2782 }, { "epoch": 15.547486033519553, "grad_norm": 0.3560462108194252, "learning_rate": 2.4357237580273084e-09, "logits/chosen": -3.512803316116333, "logits/rejected": -3.4058237075805664, "logps/chosen": -1.1344797611236572, "logps/rejected": -44.242820739746094, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 3.1338982582092285, "rewards/margins": 3.1338982582092285, "rewards/rejected": 0.0, "step": 2783 }, { "epoch": 15.553072625698324, "grad_norm": 0.363655029715933, "learning_rate": 2.3760010569762734e-09, "logits/chosen": -3.3123326301574707, "logits/rejected": -3.3357810974121094, "logps/chosen": -2.1729488372802734, "logps/rejected": -70.17076110839844, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 3.0184102058410645, "rewards/margins": 3.0184102058410645, "rewards/rejected": 0.0, "step": 2784 }, { "epoch": 15.558659217877095, "grad_norm": 0.4839223569530984, "learning_rate": 2.3170179138097266e-09, "logits/chosen": -3.253693103790283, "logits/rejected": -3.337338447570801, "logps/chosen": -1.2565081119537354, "logps/rejected": -62.19097900390625, "loss": 0.1098, "rewards/accuracies": 1.0, "rewards/chosen": 2.555415153503418, "rewards/margins": 2.555415153503418, "rewards/rejected": 0.0, "step": 2785 }, { "epoch": 15.564245810055866, "grad_norm": 0.4104548003522868, "learning_rate": 2.2587744161869926e-09, "logits/chosen": -3.2256131172180176, "logits/rejected": -3.183342218399048, "logps/chosen": -2.3459343910217285, "logps/rejected": -71.43930053710938, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.9714996814727783, "rewards/margins": 2.9714996814727783, "rewards/rejected": 0.0, "step": 2786 }, { "epoch": 15.569832402234637, "grad_norm": 0.6294585426588813, "learning_rate": 2.2012706506683297e-09, "logits/chosen": -3.559893846511841, "logits/rejected": -3.4145257472991943, "logps/chosen": -2.7881810665130615, "logps/rejected": -40.22637176513672, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 2.719789743423462, "rewards/margins": 2.719789743423462, "rewards/rejected": 0.0, "step": 2787 }, { "epoch": 15.575418994413408, "grad_norm": 0.33294000108246335, "learning_rate": 2.144506702714599e-09, "logits/chosen": -3.468883514404297, "logits/rejected": -3.5237717628479004, "logps/chosen": -9.07915210723877, "logps/rejected": -29.209163665771484, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 3.923649549484253, "rewards/margins": 3.923649549484253, "rewards/rejected": 0.0, "step": 2788 }, { "epoch": 15.581005586592179, "grad_norm": 0.3837843837695766, "learning_rate": 2.0884826566871516e-09, "logits/chosen": -2.6626386642456055, "logits/rejected": -2.7222583293914795, "logps/chosen": -19.42976188659668, "logps/rejected": -48.52910614013672, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 4.182826042175293, "rewards/margins": 4.182826042175293, "rewards/rejected": 0.0, "step": 2789 }, { "epoch": 15.58659217877095, "grad_norm": 0.43376266031743754, "learning_rate": 2.033198595847607e-09, "logits/chosen": -3.3460216522216797, "logits/rejected": -3.280305862426758, "logps/chosen": -7.48615837097168, "logps/rejected": -48.15699768066406, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 3.4884071350097656, "rewards/margins": 3.4884071350097656, "rewards/rejected": 0.0, "step": 2790 }, { "epoch": 15.59217877094972, "grad_norm": 0.43090268920455477, "learning_rate": 1.978654602358021e-09, "logits/chosen": -3.3077661991119385, "logits/rejected": -3.497183084487915, "logps/chosen": -1.2378193140029907, "logps/rejected": -66.26832580566406, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 2.811579704284668, "rewards/margins": 2.811579704284668, "rewards/rejected": 0.0, "step": 2791 }, { "epoch": 15.597765363128492, "grad_norm": 0.40436075782981956, "learning_rate": 1.9248507572804383e-09, "logits/chosen": -3.3102433681488037, "logits/rejected": -3.268357038497925, "logps/chosen": -0.8314454555511475, "logps/rejected": -39.71807098388672, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 2.867051839828491, "rewards/margins": 2.867051839828491, "rewards/rejected": 0.0, "step": 2792 }, { "epoch": 15.603351955307263, "grad_norm": 0.44023941554623247, "learning_rate": 1.8717871405769526e-09, "logits/chosen": -3.3736393451690674, "logits/rejected": -3.412764310836792, "logps/chosen": -4.920578956604004, "logps/rejected": -51.50208282470703, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 3.1983420848846436, "rewards/margins": 3.1983420848846436, "rewards/rejected": 0.0, "step": 2793 }, { "epoch": 15.608938547486034, "grad_norm": 0.39669380790497527, "learning_rate": 1.8194638311095912e-09, "logits/chosen": -3.205962896347046, "logits/rejected": -3.237531900405884, "logps/chosen": -0.654625415802002, "logps/rejected": -56.472381591796875, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 2.1814088821411133, "rewards/margins": 2.1814088821411133, "rewards/rejected": 0.0, "step": 2794 }, { "epoch": 15.614525139664805, "grad_norm": 0.6864799818663945, "learning_rate": 1.7678809066400401e-09, "logits/chosen": -3.3602356910705566, "logits/rejected": -3.311319351196289, "logps/chosen": -1.0779731273651123, "logps/rejected": -65.07955932617188, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 2.3188016414642334, "rewards/margins": 2.3188016414642334, "rewards/rejected": 0.0, "step": 2795 }, { "epoch": 15.620111731843576, "grad_norm": 0.36891150579566906, "learning_rate": 1.7170384438297548e-09, "logits/chosen": -3.42869234085083, "logits/rejected": -3.382807731628418, "logps/chosen": -0.31308987736701965, "logps/rejected": -74.28213500976562, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.075309991836548, "rewards/margins": 2.075309991836548, "rewards/rejected": 0.0, "step": 2796 }, { "epoch": 15.625698324022347, "grad_norm": 0.45196185506431397, "learning_rate": 1.6669365182396811e-09, "logits/chosen": -3.568336248397827, "logits/rejected": -3.7408010959625244, "logps/chosen": -0.34792613983154297, "logps/rejected": -44.23535919189453, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 1.9495230913162231, "rewards/margins": 1.9495230913162231, "rewards/rejected": 0.0, "step": 2797 }, { "epoch": 15.631284916201118, "grad_norm": 0.5411387809142836, "learning_rate": 1.617575204330257e-09, "logits/chosen": -3.2192094326019287, "logits/rejected": -3.367995262145996, "logps/chosen": -0.28346142172813416, "logps/rejected": -46.0729866027832, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": 2.0429527759552, "rewards/margins": 2.0429527759552, "rewards/rejected": 0.0, "step": 2798 }, { "epoch": 15.636871508379889, "grad_norm": 0.3763849368960105, "learning_rate": 1.5689545754610788e-09, "logits/chosen": -3.341198205947876, "logits/rejected": -3.2854690551757812, "logps/chosen": -3.698657512664795, "logps/rejected": -36.41625213623047, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 2.817406177520752, "rewards/margins": 2.817406177520752, "rewards/rejected": 0.0, "step": 2799 }, { "epoch": 15.64245810055866, "grad_norm": 0.33605716429498145, "learning_rate": 1.5210747038911232e-09, "logits/chosen": -3.370471715927124, "logits/rejected": -3.373816728591919, "logps/chosen": -4.742188930511475, "logps/rejected": -40.91890335083008, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 2.6829161643981934, "rewards/margins": 2.6829161643981934, "rewards/rejected": 0.0, "step": 2800 }, { "epoch": 15.64245810055866, "eval_logits/chosen": -3.2499423027038574, "eval_logits/rejected": -3.375277280807495, "eval_logps/chosen": -30.777721405029297, "eval_logps/rejected": -56.15149688720703, "eval_loss": 1.055828332901001, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.07076773047447205, "eval_rewards/margins": -0.07076773047447205, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7114, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.306, "step": 2800 }, { "epoch": 15.64804469273743, "grad_norm": 0.4077306927781004, "learning_rate": 1.4739356607784136e-09, "logits/chosen": -3.1192891597747803, "logits/rejected": -3.1471028327941895, "logps/chosen": -4.933080673217773, "logps/rejected": -155.74085998535156, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 3.16139554977417, "rewards/margins": 3.16139554977417, "rewards/rejected": 0.0, "step": 2801 }, { "epoch": 15.653631284916202, "grad_norm": 0.4373517979374343, "learning_rate": 1.427537516179911e-09, "logits/chosen": -3.622830867767334, "logits/rejected": -3.3402109146118164, "logps/chosen": -1.7372000217437744, "logps/rejected": -47.54938507080078, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 2.2426764965057373, "rewards/margins": 2.2426764965057373, "rewards/rejected": 0.0, "step": 2802 }, { "epoch": 15.659217877094973, "grad_norm": 0.43996780202222885, "learning_rate": 1.3818803390515666e-09, "logits/chosen": -3.3221375942230225, "logits/rejected": -3.4538419246673584, "logps/chosen": -0.5822171568870544, "logps/rejected": -50.92298889160156, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 2.3626534938812256, "rewards/margins": 2.3626534938812256, "rewards/rejected": 0.0, "step": 2803 }, { "epoch": 15.664804469273744, "grad_norm": 0.31288169138884964, "learning_rate": 1.3369641972481027e-09, "logits/chosen": -3.369594097137451, "logits/rejected": -3.592721939086914, "logps/chosen": -5.258505821228027, "logps/rejected": -54.46895980834961, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 3.5000388622283936, "rewards/margins": 3.5000388622283936, "rewards/rejected": 0.0, "step": 2804 }, { "epoch": 15.670391061452515, "grad_norm": 0.41871178002196785, "learning_rate": 1.2927891575228445e-09, "logits/chosen": -3.2368991374969482, "logits/rejected": -3.3737740516662598, "logps/chosen": -1.0429718494415283, "logps/rejected": -34.85597229003906, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 2.5941667556762695, "rewards/margins": 2.5941667556762695, "rewards/rejected": 0.0, "step": 2805 }, { "epoch": 15.675977653631286, "grad_norm": 0.9885229141249284, "learning_rate": 1.249355285527831e-09, "logits/chosen": -3.5666892528533936, "logits/rejected": -3.6364030838012695, "logps/chosen": -3.115077018737793, "logps/rejected": -45.557373046875, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 3.011629104614258, "rewards/margins": 3.011629104614258, "rewards/rejected": 0.0, "step": 2806 }, { "epoch": 15.681564245810057, "grad_norm": 0.43451908623452734, "learning_rate": 1.2066626458134276e-09, "logits/chosen": -3.215453863143921, "logits/rejected": -3.174268960952759, "logps/chosen": -3.840770721435547, "logps/rejected": -58.07538986206055, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 3.510533094406128, "rewards/margins": 3.510533094406128, "rewards/rejected": 0.0, "step": 2807 }, { "epoch": 15.687150837988828, "grad_norm": 0.34577052547749765, "learning_rate": 1.1647113018286025e-09, "logits/chosen": -3.3677327632904053, "logits/rejected": -3.3671536445617676, "logps/chosen": -0.43579035997390747, "logps/rejected": -92.73271179199219, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": 2.171261787414551, "rewards/margins": 2.171261787414551, "rewards/rejected": 0.0, "step": 2808 }, { "epoch": 15.692737430167599, "grad_norm": 0.401057482924194, "learning_rate": 1.1235013159205386e-09, "logits/chosen": -3.073883533477783, "logits/rejected": -3.161151885986328, "logps/chosen": -4.56620979309082, "logps/rejected": -38.5812873840332, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 3.3170461654663086, "rewards/margins": 3.3170461654663086, "rewards/rejected": 0.0, "step": 2809 }, { "epoch": 15.69832402234637, "grad_norm": 0.4264076689718895, "learning_rate": 1.083032749334467e-09, "logits/chosen": -3.21286678314209, "logits/rejected": -3.2060153484344482, "logps/chosen": -1.5400182008743286, "logps/rejected": -42.095340728759766, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 2.428891658782959, "rewards/margins": 2.428891658782959, "rewards/rejected": 0.0, "step": 2810 }, { "epoch": 15.703910614525139, "grad_norm": 0.42769534207869314, "learning_rate": 1.0433056622140557e-09, "logits/chosen": -3.2073304653167725, "logits/rejected": -3.4061129093170166, "logps/chosen": -0.41317540407180786, "logps/rejected": -56.47168731689453, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 2.034308910369873, "rewards/margins": 2.034308910369873, "rewards/rejected": 0.0, "step": 2811 }, { "epoch": 15.70949720670391, "grad_norm": 0.36023229614273433, "learning_rate": 1.0043201136006874e-09, "logits/chosen": -3.570122003555298, "logits/rejected": -3.4414169788360596, "logps/chosen": -2.0092458724975586, "logps/rejected": -32.25787353515625, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 3.2174861431121826, "rewards/margins": 3.2174861431121826, "rewards/rejected": 0.0, "step": 2812 }, { "epoch": 15.71508379888268, "grad_norm": 0.403769439570388, "learning_rate": 9.660761614339597e-10, "logits/chosen": -3.461387872695923, "logits/rejected": -3.382807731628418, "logps/chosen": -4.465088844299316, "logps/rejected": -70.43394470214844, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 3.0921897888183594, "rewards/margins": 3.0921897888183594, "rewards/rejected": 0.0, "step": 2813 }, { "epoch": 15.720670391061452, "grad_norm": 0.3666172402209137, "learning_rate": 9.285738625511297e-10, "logits/chosen": -3.2861690521240234, "logits/rejected": -3.5039193630218506, "logps/chosen": -0.2656005024909973, "logps/rejected": -48.74907302856445, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 1.6739084720611572, "rewards/margins": 1.6739084720611572, "rewards/rejected": 0.0, "step": 2814 }, { "epoch": 15.726256983240223, "grad_norm": 0.3917052549432524, "learning_rate": 8.918132726872807e-10, "logits/chosen": -3.465508222579956, "logits/rejected": -3.417743444442749, "logps/chosen": -17.935710906982422, "logps/rejected": -80.55555725097656, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 3.849966049194336, "rewards/margins": 3.849966049194336, "rewards/rejected": 0.0, "step": 2815 }, { "epoch": 15.731843575418994, "grad_norm": 0.32990131461111255, "learning_rate": 8.557944464752109e-10, "logits/chosen": -3.2874207496643066, "logits/rejected": -3.3129971027374268, "logps/chosen": -0.7452528476715088, "logps/rejected": -39.76192092895508, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 2.0606377124786377, "rewards/margins": 2.0606377124786377, "rewards/rejected": 0.0, "step": 2816 }, { "epoch": 15.737430167597765, "grad_norm": 0.31626882369629983, "learning_rate": 8.205174374453227e-10, "logits/chosen": -3.5613372325897217, "logits/rejected": -3.5797479152679443, "logps/chosen": -0.7551592588424683, "logps/rejected": -57.97498321533203, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 2.154473304748535, "rewards/margins": 2.154473304748535, "rewards/rejected": 0.0, "step": 2817 }, { "epoch": 15.743016759776536, "grad_norm": 0.3616808663572626, "learning_rate": 7.859822980255115e-10, "logits/chosen": -3.7419943809509277, "logits/rejected": -3.503006935119629, "logps/chosen": -2.766098976135254, "logps/rejected": -38.50261688232422, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.9870448112487793, "rewards/margins": 2.9870448112487793, "rewards/rejected": 0.0, "step": 2818 }, { "epoch": 15.748603351955307, "grad_norm": 0.3574739958631293, "learning_rate": 7.521890795411656e-10, "logits/chosen": -2.97888445854187, "logits/rejected": -2.923058271408081, "logps/chosen": -11.004693984985352, "logps/rejected": -31.374242782592773, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 3.3422088623046875, "rewards/margins": 3.3422088623046875, "rewards/rejected": 0.0, "step": 2819 }, { "epoch": 15.754189944134078, "grad_norm": 0.3451826393107498, "learning_rate": 7.191378322150554e-10, "logits/chosen": -3.349212884902954, "logits/rejected": -3.506939172744751, "logps/chosen": -1.2821694612503052, "logps/rejected": -51.124549865722656, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 2.8572986125946045, "rewards/margins": 2.8572986125946045, "rewards/rejected": 0.0, "step": 2820 }, { "epoch": 15.754189944134078, "eval_logits/chosen": -3.257516384124756, "eval_logits/rejected": -3.381110668182373, "eval_logps/chosen": -30.6396484375, "eval_logps/rejected": -56.05371856689453, "eval_loss": 1.049382209777832, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.056960977613925934, "eval_rewards/margins": -0.056960977613925934, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7293, "eval_samples_per_second": 9.472, "eval_steps_per_second": 0.306, "step": 2820 }, { "epoch": 15.759776536312849, "grad_norm": 0.3546071638052034, "learning_rate": 6.868286051671668e-10, "logits/chosen": -3.405954360961914, "logits/rejected": -3.282923698425293, "logps/chosen": -0.9710592031478882, "logps/rejected": -85.11328125, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": 2.37251615524292, "rewards/margins": 2.37251615524292, "rewards/rejected": 0.0, "step": 2821 }, { "epoch": 15.76536312849162, "grad_norm": 0.3724010116990325, "learning_rate": 6.552614464147566e-10, "logits/chosen": -3.3313539028167725, "logits/rejected": -3.620511293411255, "logps/chosen": -29.941688537597656, "logps/rejected": -38.74200439453125, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 3.5802860260009766, "rewards/margins": 3.5802860260009766, "rewards/rejected": 0.0, "step": 2822 }, { "epoch": 15.77094972067039, "grad_norm": 1.129327964416941, "learning_rate": 6.244364028722971e-10, "logits/chosen": -3.35436749458313, "logits/rejected": -3.415541887283325, "logps/chosen": -11.93348217010498, "logps/rejected": -32.43596649169922, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 3.7170023918151855, "rewards/margins": 3.7170023918151855, "rewards/rejected": 0.0, "step": 2823 }, { "epoch": 15.776536312849162, "grad_norm": 0.3934489794562454, "learning_rate": 5.94353520351254e-10, "logits/chosen": -3.326505422592163, "logits/rejected": -3.2996695041656494, "logps/chosen": -5.983352184295654, "logps/rejected": -51.20966339111328, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 3.2338926792144775, "rewards/margins": 3.2338926792144775, "rewards/rejected": 0.0, "step": 2824 }, { "epoch": 15.782122905027933, "grad_norm": 0.3626987993167899, "learning_rate": 5.65012843560142e-10, "logits/chosen": -3.15582537651062, "logits/rejected": -3.2727463245391846, "logps/chosen": -1.3835318088531494, "logps/rejected": -82.92355346679688, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 2.4493255615234375, "rewards/margins": 2.4493255615234375, "rewards/rejected": 0.0, "step": 2825 }, { "epoch": 15.787709497206704, "grad_norm": 0.3539223343213207, "learning_rate": 5.364144161044693e-10, "logits/chosen": -3.2896037101745605, "logits/rejected": -3.143247127532959, "logps/chosen": -19.809814453125, "logps/rejected": -46.5184326171875, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 3.245551586151123, "rewards/margins": 3.245551586151123, "rewards/rejected": 0.0, "step": 2826 }, { "epoch": 15.793296089385475, "grad_norm": 0.45218234362373877, "learning_rate": 5.085582804865706e-10, "logits/chosen": -3.4615471363067627, "logits/rejected": -3.479534149169922, "logps/chosen": -7.718883991241455, "logps/rejected": -53.56597900390625, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 3.8744218349456787, "rewards/margins": 3.8744218349456787, "rewards/rejected": 0.0, "step": 2827 }, { "epoch": 15.798882681564246, "grad_norm": 0.47733689901707615, "learning_rate": 4.814444781056082e-10, "logits/chosen": -3.46455979347229, "logits/rejected": -3.5759031772613525, "logps/chosen": -3.0673773288726807, "logps/rejected": -76.47246551513672, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.164743661880493, "rewards/margins": 3.164743661880493, "rewards/rejected": 0.0, "step": 2828 }, { "epoch": 15.804469273743017, "grad_norm": 0.49745046031795465, "learning_rate": 4.550730492575705e-10, "logits/chosen": -2.965928316116333, "logits/rejected": -2.9631450176239014, "logps/chosen": -10.267690658569336, "logps/rejected": -35.395652770996094, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 3.2025234699249268, "rewards/margins": 3.2025234699249268, "rewards/rejected": 0.0, "step": 2829 }, { "epoch": 15.810055865921788, "grad_norm": 0.37343320783515743, "learning_rate": 4.294440331350513e-10, "logits/chosen": -3.3837499618530273, "logits/rejected": -3.3825016021728516, "logps/chosen": -5.725942611694336, "logps/rejected": -31.717445373535156, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 3.850558280944824, "rewards/margins": 3.850558280944824, "rewards/rejected": 0.0, "step": 2830 }, { "epoch": 15.815642458100559, "grad_norm": 0.6451118237468618, "learning_rate": 4.0455746782736e-10, "logits/chosen": -3.4130003452301025, "logits/rejected": -3.327850103378296, "logps/chosen": -0.28376999497413635, "logps/rejected": -112.74542236328125, "loss": 0.1483, "rewards/accuracies": 1.0, "rewards/chosen": 1.6321723461151123, "rewards/margins": 1.6321723461151123, "rewards/rejected": 0.0, "step": 2831 }, { "epoch": 15.82122905027933, "grad_norm": 0.9555876968137641, "learning_rate": 3.8041339032029993e-10, "logits/chosen": -3.3221280574798584, "logits/rejected": -3.451664447784424, "logps/chosen": -47.78090286254883, "logps/rejected": -55.13854217529297, "loss": 0.0975, "rewards/accuracies": 1.0, "rewards/chosen": 3.7417855262756348, "rewards/margins": 3.7417855262756348, "rewards/rejected": 0.0, "step": 2832 }, { "epoch": 15.8268156424581, "grad_norm": 0.33516527586800354, "learning_rate": 3.5701183649639035e-10, "logits/chosen": -3.323343276977539, "logits/rejected": -3.324136257171631, "logps/chosen": -1.7849292755126953, "logps/rejected": -73.18279266357422, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 2.9350509643554688, "rewards/margins": 2.9350509643554688, "rewards/rejected": 0.0, "step": 2833 }, { "epoch": 15.832402234636872, "grad_norm": 0.34990907835476237, "learning_rate": 3.343528411344221e-10, "logits/chosen": -3.246032476425171, "logits/rejected": -3.4260365962982178, "logps/chosen": -1.6394751071929932, "logps/rejected": -78.3773422241211, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 2.8834900856018066, "rewards/margins": 2.8834900856018066, "rewards/rejected": 0.0, "step": 2834 }, { "epoch": 15.837988826815643, "grad_norm": 0.35826203365915993, "learning_rate": 3.1243643790968e-10, "logits/chosen": -3.260681390762329, "logits/rejected": -3.158780813217163, "logps/chosen": -0.801013708114624, "logps/rejected": -84.7547607421875, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 2.4451889991760254, "rewards/margins": 2.4451889991760254, "rewards/rejected": 0.0, "step": 2835 }, { "epoch": 15.843575418994414, "grad_norm": 0.37958438137663664, "learning_rate": 2.912626593938872e-10, "logits/chosen": -3.321927070617676, "logits/rejected": -3.245786190032959, "logps/chosen": -3.858067274093628, "logps/rejected": -69.91818237304688, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 2.448540210723877, "rewards/margins": 2.448540210723877, "rewards/rejected": 0.0, "step": 2836 }, { "epoch": 15.849162011173185, "grad_norm": 0.341561244863116, "learning_rate": 2.70831537055094e-10, "logits/chosen": -3.281402349472046, "logits/rejected": -3.4028379917144775, "logps/chosen": -1.1686630249023438, "logps/rejected": -61.36640548706055, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": 2.2457661628723145, "rewards/margins": 2.2457661628723145, "rewards/rejected": 0.0, "step": 2837 }, { "epoch": 15.854748603351956, "grad_norm": 0.46006948038488005, "learning_rate": 2.511431012575116e-10, "logits/chosen": -3.2890331745147705, "logits/rejected": -3.3994855880737305, "logps/chosen": -0.8558558225631714, "logps/rejected": -42.05720138549805, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 2.270813465118408, "rewards/margins": 2.270813465118408, "rewards/rejected": 0.0, "step": 2838 }, { "epoch": 15.860335195530727, "grad_norm": 0.425827273136935, "learning_rate": 2.3219738126162292e-10, "logits/chosen": -3.5399320125579834, "logits/rejected": -3.692891836166382, "logps/chosen": -1.2003713846206665, "logps/rejected": -37.772796630859375, "loss": 0.1496, "rewards/accuracies": 1.0, "rewards/chosen": 2.314761161804199, "rewards/margins": 2.314761161804199, "rewards/rejected": 0.0, "step": 2839 }, { "epoch": 15.865921787709498, "grad_norm": 0.5341951624759204, "learning_rate": 2.1399440522429367e-10, "logits/chosen": -3.338050603866577, "logits/rejected": -3.5315940380096436, "logps/chosen": -1.6954944133758545, "logps/rejected": -46.38953399658203, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 2.8678371906280518, "rewards/margins": 2.8678371906280518, "rewards/rejected": 0.0, "step": 2840 }, { "epoch": 15.865921787709498, "eval_logits/chosen": -3.258183240890503, "eval_logits/rejected": -3.381404161453247, "eval_logps/chosen": -30.842792510986328, "eval_logps/rejected": -56.60395431518555, "eval_loss": 1.0576844215393066, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.0772751122713089, "eval_rewards/margins": -0.0772751122713089, "eval_rewards/rejected": 0.0, "eval_runtime": 32.6797, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.306, "step": 2840 }, { "epoch": 15.871508379888269, "grad_norm": 0.3903180939096369, "learning_rate": 1.965342001982173e-10, "logits/chosen": -3.5267210006713867, "logits/rejected": -3.6699228286743164, "logps/chosen": -1.2052972316741943, "logps/rejected": -32.841796875, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 2.193192481994629, "rewards/margins": 2.193192481994629, "rewards/rejected": 0.0, "step": 2841 }, { "epoch": 15.87709497206704, "grad_norm": 0.3768808213732159, "learning_rate": 1.7981679213247002e-10, "logits/chosen": -3.294266700744629, "logits/rejected": -3.1123218536376953, "logps/chosen": -0.28513163328170776, "logps/rejected": -139.90008544921875, "loss": 0.0926, "rewards/accuracies": 1.0, "rewards/chosen": 1.780448079109192, "rewards/margins": 1.780448079109192, "rewards/rejected": 0.0, "step": 2842 }, { "epoch": 15.88268156424581, "grad_norm": 0.43974030954277454, "learning_rate": 1.6384220587212227e-10, "logits/chosen": -3.217982053756714, "logits/rejected": -3.167692184448242, "logps/chosen": -1.0253431797027588, "logps/rejected": -22.315250396728516, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.5189590454101562, "rewards/margins": 2.5189590454101562, "rewards/rejected": 0.0, "step": 2843 }, { "epoch": 15.888268156424582, "grad_norm": 0.36849146650165276, "learning_rate": 1.4861046515818322e-10, "logits/chosen": -3.2889318466186523, "logits/rejected": -3.4570798873901367, "logps/chosen": -12.28207015991211, "logps/rejected": -52.328094482421875, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 3.6533098220825195, "rewards/margins": 3.6533098220825195, "rewards/rejected": 0.0, "step": 2844 }, { "epoch": 15.893854748603353, "grad_norm": 0.39614195727997487, "learning_rate": 1.3412159262771173e-10, "logits/chosen": -3.5022990703582764, "logits/rejected": -3.5626304149627686, "logps/chosen": -2.2584948539733887, "logps/rejected": -39.102142333984375, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 2.5635828971862793, "rewards/margins": 2.5635828971862793, "rewards/rejected": 0.0, "step": 2845 }, { "epoch": 15.899441340782122, "grad_norm": 0.3592022131781414, "learning_rate": 1.2037560981381644e-10, "logits/chosen": -3.4192519187927246, "logits/rejected": -3.5048091411590576, "logps/chosen": -4.079451084136963, "logps/rejected": -79.90035247802734, "loss": 0.1145, "rewards/accuracies": 1.0, "rewards/chosen": 2.5213623046875, "rewards/margins": 2.5213623046875, "rewards/rejected": 0.0, "step": 2846 }, { "epoch": 15.905027932960895, "grad_norm": 0.3622297963349978, "learning_rate": 1.0737253714548922e-10, "logits/chosen": -3.217479944229126, "logits/rejected": -3.2967512607574463, "logps/chosen": -2.382681369781494, "logps/rejected": -67.23530578613281, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 3.307535171508789, "rewards/margins": 3.307535171508789, "rewards/rejected": 0.0, "step": 2847 }, { "epoch": 15.910614525139664, "grad_norm": 0.4541421069651796, "learning_rate": 9.511239394754956e-11, "logits/chosen": -3.268536329269409, "logits/rejected": -3.3430495262145996, "logps/chosen": -15.523447036743164, "logps/rejected": -48.1860466003418, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 3.9595518112182617, "rewards/margins": 3.9595518112182617, "rewards/rejected": 0.0, "step": 2848 }, { "epoch": 15.916201117318435, "grad_norm": 0.373783789412415, "learning_rate": 8.35951984408112e-11, "logits/chosen": -3.253858804702759, "logits/rejected": -3.187084674835205, "logps/chosen": -1.0501309633255005, "logps/rejected": -67.25447082519531, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 2.7255032062530518, "rewards/margins": 2.7255032062530518, "rewards/rejected": 0.0, "step": 2849 }, { "epoch": 15.921787709497206, "grad_norm": 0.3555827029802541, "learning_rate": 7.282096774180457e-11, "logits/chosen": -3.4511032104492188, "logits/rejected": -3.380977153778076, "logps/chosen": -4.205111026763916, "logps/rejected": -40.449798583984375, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 3.0369317531585693, "rewards/margins": 3.0369317531585693, "rewards/rejected": 0.0, "step": 2850 }, { "epoch": 15.927374301675977, "grad_norm": 0.32413343270208034, "learning_rate": 6.278971786305432e-11, "logits/chosen": -3.3584396839141846, "logits/rejected": -3.4900057315826416, "logps/chosen": -5.657356262207031, "logps/rejected": -52.383033752441406, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 3.128657341003418, "rewards/margins": 3.128657341003418, "rewards/rejected": 0.0, "step": 2851 }, { "epoch": 15.932960893854748, "grad_norm": 0.3237755223361088, "learning_rate": 5.3501463712690706e-11, "logits/chosen": -3.318645715713501, "logits/rejected": -3.4634900093078613, "logps/chosen": -9.837621688842773, "logps/rejected": -29.679607391357422, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 3.9674246311187744, "rewards/margins": 3.9674246311187744, "rewards/rejected": 0.0, "step": 2852 }, { "epoch": 15.938547486033519, "grad_norm": 0.45349698875495575, "learning_rate": 4.4956219094782756e-11, "logits/chosen": -3.209752321243286, "logits/rejected": -3.299441337585449, "logps/chosen": -7.105777740478516, "logps/rejected": -48.06958770751953, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 3.6124625205993652, "rewards/margins": 3.6124625205993652, "rewards/rejected": 0.0, "step": 2853 }, { "epoch": 15.94413407821229, "grad_norm": 0.3385902059748917, "learning_rate": 3.71539967090051e-11, "logits/chosen": -3.5183067321777344, "logits/rejected": -3.205066680908203, "logps/chosen": -0.8646174669265747, "logps/rejected": -47.16189956665039, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 2.7576231956481934, "rewards/margins": 2.7576231956481934, "rewards/rejected": 0.0, "step": 2854 }, { "epoch": 15.949720670391061, "grad_norm": 0.329009735142087, "learning_rate": 3.009480815091559e-11, "logits/chosen": -3.21635365486145, "logits/rejected": -3.232184410095215, "logps/chosen": -0.7088548541069031, "logps/rejected": -44.514739990234375, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 2.5989913940429688, "rewards/margins": 2.5989913940429688, "rewards/rejected": 0.0, "step": 2855 }, { "epoch": 15.955307262569832, "grad_norm": 0.4543984012760132, "learning_rate": 2.377866391173322e-11, "logits/chosen": -3.3538575172424316, "logits/rejected": -3.369420051574707, "logps/chosen": -0.6714050769805908, "logps/rejected": -62.79505157470703, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 2.1580557823181152, "rewards/margins": 2.1580557823181152, "rewards/rejected": 0.0, "step": 2856 }, { "epoch": 15.960893854748603, "grad_norm": 0.3553760966999862, "learning_rate": 1.8205573378338167e-11, "logits/chosen": -3.3686656951904297, "logits/rejected": -3.355982542037964, "logps/chosen": -1.023118019104004, "logps/rejected": -38.32646942138672, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 2.8010873794555664, "rewards/margins": 2.8010873794555664, "rewards/rejected": 0.0, "step": 2857 }, { "epoch": 15.966480446927374, "grad_norm": 0.6577935053933072, "learning_rate": 1.3375544833382769e-11, "logits/chosen": -3.499521017074585, "logits/rejected": -3.325239419937134, "logps/chosen": -2.625056743621826, "logps/rejected": -35.65730667114258, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 2.995539665222168, "rewards/margins": 2.995539665222168, "rewards/rejected": 0.0, "step": 2858 }, { "epoch": 15.972067039106145, "grad_norm": 0.39916437887814754, "learning_rate": 9.288585455069498e-12, "logits/chosen": -3.4343879222869873, "logits/rejected": -3.238798141479492, "logps/chosen": -1.6715790033340454, "logps/rejected": -28.844999313354492, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.5408287048339844, "rewards/margins": 2.5408287048339844, "rewards/rejected": 0.0, "step": 2859 }, { "epoch": 15.977653631284916, "grad_norm": 0.4465943684246888, "learning_rate": 5.944701317428524e-12, "logits/chosen": -3.165088415145874, "logits/rejected": -3.23022723197937, "logps/chosen": -1.974366545677185, "logps/rejected": -49.10314178466797, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 2.056807041168213, "rewards/margins": 2.056807041168213, "rewards/rejected": 0.0, "step": 2860 }, { "epoch": 15.977653631284916, "eval_logits/chosen": -3.2575747966766357, "eval_logits/rejected": -3.3810267448425293, "eval_logps/chosen": -30.767574310302734, "eval_logps/rejected": -56.184364318847656, "eval_loss": 1.0558310747146606, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -0.06975345313549042, "eval_rewards/margins": -0.06975345313549042, "eval_rewards/rejected": 0.0, "eval_runtime": 32.7389, "eval_samples_per_second": 9.469, "eval_steps_per_second": 0.305, "step": 2860 }, { "epoch": 15.983240223463687, "grad_norm": 0.67981272571839, "learning_rate": 3.34389738998464e-12, "logits/chosen": -3.2139832973480225, "logits/rejected": -3.546630382537842, "logps/chosen": -6.066982269287109, "logps/rejected": -87.87742614746094, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": 2.997310161590576, "rewards/margins": 2.997310161590576, "rewards/rejected": 0.0, "step": 2861 }, { "epoch": 15.988826815642458, "grad_norm": 0.3724043214672477, "learning_rate": 1.4861775380903275e-12, "logits/chosen": -3.2743287086486816, "logits/rejected": -3.3364179134368896, "logps/chosen": -11.646625518798828, "logps/rejected": -46.14485168457031, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 3.116117477416992, "rewards/margins": 3.116117477416992, "rewards/rejected": 0.0, "step": 2862 }, { "epoch": 15.994413407821229, "grad_norm": 0.3525394612067773, "learning_rate": 3.715445225371816e-13, "logits/chosen": -3.5659289360046387, "logits/rejected": -3.6955089569091797, "logps/chosen": -34.07235336303711, "logps/rejected": -28.369863510131836, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.8572776317596436, "rewards/margins": 3.8572776317596436, "rewards/rejected": 0.0, "step": 2863 }, { "epoch": 16.0, "grad_norm": 0.41363535239067295, "learning_rate": 0.0, "logits/chosen": -3.784046173095703, "logits/rejected": -3.575810670852661, "logps/chosen": -5.2440080642700195, "logps/rejected": -48.66508483886719, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 3.3392105102539062, "rewards/margins": 3.3392105102539062, "rewards/rejected": 0.0, "step": 2864 }, { "epoch": 16.0, "step": 2864, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0303, "train_samples_per_second": 3013918.631, "train_steps_per_second": 94647.62 } ], "logging_steps": 1, "max_steps": 2864, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }